This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(rpart) #partining of DT
library(caret) #To partition the data into test and training
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart.plot)
library(data.tree)
library(caTools) #Manipulation of data
library(ggplot2)
library(tidyr)
library(outliers)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: '/opt/X11/lib/libSM.6.dylib'
## Referenced from: '/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/modules/R_X11.so'
## Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-17.0.1+12/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## tcltk DLL is linked to '/opt/X11/lib/libX11.6.dylib'
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(dlookr)
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
library(corrplot)
## corrplot 0.92 loaded
library(aqp)
## This is aqp 1.42
##
## Attaching package: 'aqp'
## The following objects are masked from 'package:dplyr':
##
## combine, slice
library(soilDB)
library('pROC')
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROCR)
library("randomForest")
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:aqp':
##
## combine
## The following object is masked from 'package:outliers':
##
## outlier
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(RColorBrewer)
# Read the feature (X) and label (Y) tables for the test and training splits.
# header = FALSE: the first line of each file is treated as data, so the
# columns receive the default names V1, V2, ...
df_testX <- read.csv(file = "testX.csv", header = FALSE)
df_testY <- read.csv(file = "testY.csv", header = FALSE)
df_trainX <- read.csv(file = "trainX.csv", header = FALSE)
df_trainY <- read.csv(file = "trainY.csv", header = FALSE)

# Preview the first rows of each table.
head(x = df_testX)
head(x = df_testY)
head(x = df_trainX)
head(x = df_trainY)
# Column names for the 30 feature columns, in file order: ten measurements,
# each reported as *_mean, *_se and *_worst.
# Defined once so the test and training tables cannot drift apart.
# NOTE(review): 'texure_se' looks like a typo for 'texture_se'; it is kept
# unchanged because later code may refer to the column by this name.
feature_names <- c(
  'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
  'smoothness_mean', 'compactness_mean', 'concavity_mean',
  'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
  'radius_se', 'texure_se', 'perimeter_se', 'area_se', 'smoothness_se',
  'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se',
  'fractal_dimension_se',
  'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
  'smoothness_worst', 'compactness_worst', 'concavity_worst',
  'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst'
)

# Fail fast if either feature table does not have exactly one column per name.
stopifnot(
  ncol(df_testX) == length(feature_names),
  ncol(df_trainX) == length(feature_names)
)

names(df_testX) <- feature_names
names(df_trainX) <- feature_names
## Append the label table to the right of its feature table with `cbind()`:
## df_testX with df_testY, and df_trainX with df_trainY.
df_test <- cbind(df_testX, df_testY)
df_train <- cbind(df_trainX, df_trainY)

# Print both combined tables.
df_test
df_train

# The label column still carries the default name V1; call it diagnosis.
df_test <- rename(df_test, diagnosis = V1)
df_train <- rename(df_train, diagnosis = V1)
# Structure of the test data frame: dimensions, column names, column types
# and the first few values of each column.
str(df_test)
## 'data.frame': 57 obs. of 31 variables:
## $ radius_mean : num 13.4 12.2 14.8 14.6 14.6 ...
## $ texture_mean : num 30.7 20.5 17.7 15.2 23.3 ...
## $ perimeter_mean : num 86.3 77.2 95.9 95.8 94 ...
## $ area_mean : num 557 459 675 652 665 ...
## $ smoothness_mean : num 0.0925 0.0801 0.0918 0.1132 0.0868 ...
## $ compactness_mean : num 0.0743 0.0404 0.0889 0.1339 0.0664 ...
## $ concavity_mean : num 0.0282 0.0238 0.0407 0.0997 0.0839 ...
## $ concave_points_mean : num 0.0326 0.0177 0.0226 0.0706 0.0527 ...
## $ symmetry_mean : num 0.138 0.174 0.189 0.212 0.163 ...
## $ fractal_dimension_mean : num 0.0602 0.0568 0.0589 0.0635 0.0542 ...
## $ radius_se : num 0.341 0.192 0.22 0.511 0.416 ...
## $ texure_se : num 1.924 1.571 0.622 0.737 1.627 ...
## $ perimeter_se : num 2.29 1.18 1.48 3.81 2.91 ...
## $ area_se : num 28.9 14.7 19.8 42.8 33 ...
## $ smoothness_se : num 0.00584 0.00508 0.0048 0.00551 0.00831 ...
## $ compactness_se : num 0.0125 0.0061 0.0117 0.0441 0.0174 ...
## $ concavity_se : num 0.00794 0.01069 0.01758 0.04436 0.03389 ...
## $ concave_points_se : num 0.00913 0.0068 0.0069 0.01623 0.01576 ...
## $ symmetry_se : num 0.0156 0.0145 0.0225 0.0243 0.0174 ...
## $ fractal_dimension_se : num 0.00298 0.00153 0.00197 0.00484 0.00287 ...
## $ radius_worst : num 15.1 13.3 16.4 16.3 15.8 ...
## $ texture_worst : num 41.6 32.8 22.7 18.2 31.7 ...
## $ perimeter_worst : num 96.7 84.6 105.9 109.4 102.2 ...
## $ area_worst : num 706 548 830 804 758 ...
## $ smoothness_worst : num 0.117 0.112 0.123 0.128 0.131 ...
## $ compactness_worst : num 0.1421 0.0886 0.1881 0.3089 0.1581 ...
## $ concavity_worst : num 0.07 0.115 0.206 0.26 0.268 ...
## $ concave_points_worst : num 0.0776 0.0743 0.0831 0.1397 0.1359 ...
## $ symmetry_worst : num 0.22 0.269 0.36 0.315 0.248 ...
## $ fractal_dimension_worst: num 0.0767 0.0688 0.0728 0.0847 0.0684 ...
## $ diagnosis : int 0 0 0 0 1 1 0 1 0 1 ...
# Structure of the training data frame: dimensions, column names, column types
# and the first few values of each column.
str(df_train)
## 'data.frame': 455 obs. of 31 variables:
## $ radius_mean : num 12.8 12.9 17.9 19.2 13.7 ...
## $ texture_mean : num 22.3 13.3 24.5 26.6 15.2 ...
## $ perimeter_mean : num 85.3 82.8 115.2 126.2 88.3 ...
## $ area_mean : num 503 505 999 1138 581 ...
## $ smoothness_mean : num 0.1088 0.1134 0.0886 0.102 0.0827 ...
## $ compactness_mean : num 0.1799 0.0883 0.0703 0.1453 0.0755 ...
## $ concavity_mean : num 0.1695 0.038 0.057 0.1921 0.0425 ...
## $ concave_points_mean : num 0.0686 0.034 0.0474 0.0966 0.0247 ...
## $ symmetry_mean : num 0.212 0.154 0.154 0.19 0.179 ...
## $ fractal_dimension_mean : num 0.0725 0.0648 0.0551 0.0622 0.059 ...
## $ radius_se : num 0.306 0.221 0.421 0.636 0.14 ...
## $ texure_se : num 1.069 1.042 1.433 1.001 0.542 ...
## $ perimeter_se : num 2.26 1.61 2.77 4.32 1.1 ...
## $ area_se : num 25.1 16.6 45.8 69.7 11.3 ...
## $ smoothness_se : num 0.00698 0.00591 0.00544 0.00739 0.00521 ...
## $ compactness_se : num 0.0386 0.0202 0.0117 0.0245 0.0298 ...
## $ concavity_se : num 0.0468 0.019 0.0162 0.0399 0.0244 ...
## $ concave_points_se : num 0.01499 0.01011 0.00852 0.01293 0.00836 ...
## $ symmetry_se : num 0.0168 0.012 0.0142 0.0143 0.0182 ...
## $ fractal_dimension_se : num 0.00562 0.00311 0.00275 0.00345 0.00487 ...
## $ radius_worst : num 15.2 14 20.9 23.7 14.5 ...
## $ texture_worst : num 30.1 21.1 34.7 35.9 19.6 ...
## $ perimeter_worst : num 105.3 92.8 135.1 159.8 98 ...
## $ area_worst : num 706 600 1320 1724 657 ...
## $ smoothness_worst : num 0.178 0.155 0.132 0.178 0.128 ...
## $ compactness_worst : num 0.534 0.223 0.181 0.384 0.31 ...
## $ concavity_worst : num 0.628 0.179 0.208 0.575 0.257 ...
## $ concave_points_worst : num 0.198 0.116 0.114 0.187 0.105 ...
## $ symmetry_worst : num 0.341 0.238 0.25 0.326 0.339 ...
## $ fractal_dimension_worst: num 0.1243 0.0855 0.0795 0.0972 0.0964 ...
## $ diagnosis : int 1 0 1 1 0 1 0 0 1 0 ...
#Findings : There are 31 fields with 57 rows in test dataset
# There are 31 fields with 455 rows in train dataset
# Per-column summary statistics (min, quartiles, mean, max) for the test set
summary(df_test)
## radius_mean texture_mean perimeter_mean area_mean
## Min. : 8.597 Min. :10.38 Min. : 54.09 Min. : 221.2
## 1st Qu.:11.890 1st Qu.:15.24 1st Qu.: 77.22 1st Qu.: 440.6
## Median :13.530 Median :18.61 Median : 87.91 Median : 565.4
## Mean :14.357 Mean :18.65 Mean : 93.53 Mean : 674.0
## 3rd Qu.:17.140 3rd Qu.:20.58 3rd Qu.:115.00 3rd Qu.: 912.7
## Max. :22.270 Max. :31.12 Max. :152.80 Max. :1509.0
## smoothness_mean compactness_mean concavity_mean concave_points_mean
## Min. :0.07026 Min. :0.03212 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08713 1st Qu.:0.06136 1st Qu.:0.02819 1st Qu.:0.02260
## Median :0.09831 Median :0.09752 Median :0.06636 Median :0.03438
## Mean :0.09808 Mean :0.10531 Mean :0.09124 Mean :0.05327
## 3rd Qu.:0.10740 3rd Qu.:0.13100 3rd Qu.:0.13350 3rd Qu.:0.08293
## Max. :0.13260 Max. :0.27760 Max. :0.42640 Max. :0.18230
## symmetry_mean fractal_dimension_mean radius_se texure_se
## Min. :0.1342 Min. :0.05395 Min. :0.1312 Min. :0.3602
## 1st Qu.:0.1619 1st Qu.:0.05886 1st Qu.:0.2522 1st Qu.:0.8570
## Median :0.1792 Median :0.06140 Median :0.3478 Median :1.0380
## Mean :0.1859 Mean :0.06248 Mean :0.4503 Mean :1.1721
## 3rd Qu.:0.1973 3rd Qu.:0.06491 3rd Qu.:0.5449 3rd Qu.:1.4750
## Max. :0.3040 Max. :0.07871 Max. :1.2150 Max. :2.7770
## perimeter_se area_se smoothness_se compactness_se
## Min. : 1.107 Min. : 9.438 Min. :0.003271 Min. :0.004711
## 1st Qu.: 1.696 1st Qu.: 18.950 1st Qu.:0.004796 1st Qu.:0.012700
## Median : 2.567 Median : 31.010 Median :0.005841 Median :0.018850
## Mean : 3.152 Mean : 46.000 Mean :0.006489 Mean :0.022830
## 3rd Qu.: 3.814 3rd Qu.: 50.960 3rd Qu.:0.006662 3rd Qu.:0.030290
## Max. :10.050 Max. :199.700 Max. :0.020750 Max. :0.086680
## concavity_se concave_points_se symmetry_se fractal_dimension_se
## Min. :0.00000 Min. :0.00000 Min. :0.01057 Min. :0.0009502
## 1st Qu.:0.01390 1st Qu.:0.00842 1st Qu.:0.01447 1st Qu.:0.0022680
## Median :0.02636 Median :0.01069 Median :0.01731 Median :0.0029850
## Mean :0.02761 Mean :0.01130 Mean :0.02005 Mean :0.0034004
## 3rd Qu.:0.03437 3rd Qu.:0.01365 3rd Qu.:0.02370 3rd Qu.:0.0042250
## Max. :0.10400 Max. :0.02480 Max. :0.06146 Max. :0.0074440
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 8.952 Min. :12.49 Min. : 56.65 Min. : 240.1
## 1st Qu.:13.340 1st Qu.:20.14 1st Qu.: 85.10 1st Qu.: 547.8
## Median :15.750 Median :24.62 Median :102.50 Median : 758.2
## Mean :16.967 Mean :25.28 Mean :112.20 Mean : 964.9
## 3rd Qu.:20.010 3rd Qu.:28.39 3rd Qu.:134.90 3rd Qu.:1227.0
## Max. :32.490 Max. :47.16 Max. :214.00 Max. :3432.0
## smoothness_worst compactness_worst concavity_worst concave_points_worst
## Min. :0.08484 Min. :0.05332 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11370 1st Qu.:0.14210 1st Qu.:0.1091 1st Qu.:0.07407
## Median :0.13470 Median :0.23020 Median :0.2604 Median :0.11380
## Mean :0.13204 Mean :0.25247 Mean :0.2771 Mean :0.12033
## 3rd Qu.:0.14780 3rd Qu.:0.33090 3rd Qu.:0.3779 3rd Qu.:0.16420
## Max. :0.18510 Max. :0.69970 Max. :0.9608 Max. :0.29100
## symmetry_worst fractal_dimension_worst diagnosis
## Min. :0.1890 Min. :0.06037 Min. :0.0000
## 1st Qu.:0.2542 1st Qu.:0.07191 1st Qu.:0.0000
## Median :0.2889 Median :0.07875 Median :0.0000
## Mean :0.2950 Mean :0.08268 Mean :0.4386
## 3rd Qu.:0.3216 3rd Qu.:0.08579 3rd Qu.:1.0000
## Max. :0.4761 Max. :0.14020 Max. :1.0000
# Per-column summary statistics (min, quartiles, mean, max) for the training set
summary(df_train)
## radius_mean texture_mean perimeter_mean area_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## 1st Qu.:11.615 1st Qu.:16.21 1st Qu.: 74.70 1st Qu.: 412.6
## Median :13.280 Median :18.83 Median : 85.98 Median : 545.2
## Mean :14.105 Mean :19.39 Mean : 91.86 Mean : 654.6
## 3rd Qu.:15.815 3rd Qu.:21.93 3rd Qu.:103.75 3rd Qu.: 785.6
## Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
## smoothness_mean compactness_mean concavity_mean concave_points_mean
## Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08618 1st Qu.:0.06616 1st Qu.:0.03041 1st Qu.:0.01988
## Median :0.09578 Median :0.09453 Median :0.06476 Median :0.03390
## Mean :0.09623 Mean :0.10527 Mean :0.09028 Mean :0.04900
## 3rd Qu.:0.10470 3rd Qu.:0.13060 3rd Qu.:0.13220 3rd Qu.:0.07402
## Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
## symmetry_mean fractal_dimension_mean radius_se texure_se
## Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3621
## 1st Qu.:0.1626 1st Qu.:0.05799 1st Qu.:0.2321 1st Qu.:0.8281
## Median :0.1799 Median :0.06183 Median :0.3163 Median :1.1080
## Mean :0.1809 Mean :0.06301 Mean :0.4020 Mean :1.2274
## 3rd Qu.:0.1958 3rd Qu.:0.06639 3rd Qu.:0.4695 3rd Qu.:1.4795
## Max. :0.2906 Max. :0.09744 Max. :2.8730 Max. :4.8850
## perimeter_se area_se smoothness_se compactness_se
## Min. : 0.757 Min. : 6.802 Min. :0.001713 Min. :0.002252
## 1st Qu.: 1.645 1st Qu.: 17.670 1st Qu.:0.005228 1st Qu.:0.013710
## Median : 2.279 Median : 23.940 Median :0.006458 Median :0.021150
## Mean : 2.856 Mean : 40.172 Mean :0.007162 Mean :0.026047
## 3rd Qu.: 3.307 3rd Qu.: 44.935 3rd Qu.:0.008370 3rd Qu.:0.033065
## Max. :21.980 Max. :542.200 Max. :0.031130 Max. :0.135400
## concavity_se concave_points_se symmetry_se fractal_dimension_se
## Min. :0.00000 Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.01574 1st Qu.:0.007759 1st Qu.:0.015220 1st Qu.:0.0022795
## Median :0.02626 Median :0.011030 Median :0.018970 Median :0.0032370
## Mean :0.03288 Mean :0.011887 Mean :0.020687 Mean :0.0038707
## 3rd Qu.:0.04290 3rd Qu.:0.014960 3rd Qu.:0.023705 3rd Qu.:0.0045715
## Max. :0.39600 Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:12.97 1st Qu.:21.09 1st Qu.: 83.80 1st Qu.: 510.1
## Median :14.90 Median :25.44 Median : 97.58 Median : 677.3
## Mean :16.22 Mean :25.78 Mean :106.95 Mean : 877.5
## 3rd Qu.:18.66 3rd Qu.:29.99 3rd Qu.:125.65 3rd Qu.:1057.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave_points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11735 1st Qu.:0.14860 1st Qu.:0.1203 1st Qu.:0.06326
## Median :0.13120 Median :0.21700 Median :0.2299 Median :0.10170
## Mean :0.13278 Mean :0.25871 Mean :0.2769 Mean :0.11502
## 3rd Qu.:0.14635 3rd Qu.:0.34305 3rd Qu.:0.3900 3rd Qu.:0.16650
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29030
## symmetry_worst fractal_dimension_worst diagnosis
## Min. :0.1565 Min. :0.05504 Min. :0.0000
## 1st Qu.:0.2510 1st Qu.:0.07224 1st Qu.:0.0000
## Median :0.2826 Median :0.08052 Median :0.0000
## Mean :0.2905 Mean :0.08464 Mean :0.3714
## 3rd Qu.:0.3181 3rd Qu.:0.09219 3rd Qu.:1.0000
## Max. :0.6638 Max. :0.20750 Max. :1.0000
# Count the missing (NA) values in each column of the training set:
# is.na() marks every NA cell and colSums() totals them per column.
# NOTE(review): only df_train is checked here; df_test is not checked in the
# visible code.
colSums(is.na(df_train))
## radius_mean texture_mean perimeter_mean
## 0 0 0
## area_mean smoothness_mean compactness_mean
## 0 0 0
## concavity_mean concave_points_mean symmetry_mean
## 0 0 0
## fractal_dimension_mean radius_se texure_se
## 0 0 0
## perimeter_se area_se smoothness_se
## 0 0 0
## compactness_se concavity_se concave_points_se
## 0 0 0
## symmetry_se fractal_dimension_se radius_worst
## 0 0 0
## texture_worst perimeter_worst area_worst
## 0 0 0
## smoothness_worst compactness_worst concavity_worst
## 0 0 0
## concave_points_worst symmetry_worst fractal_dimension_worst
## 0 0 0
## diagnosis
## 0
#Findings : There are no missing records in the train dataset
# Finding outliers and treating them
# Box plot of every column of the training set to look for outliers.
# las must be one of 0, 1, 2 or 3 (see ?par); 3 draws all axis labels
# vertically, which keeps the long column names readable.
boxplot(df_train, las = 3, main = "Outlier detection of all columns using box plot")
## Outliers are removed with z-scores rather than the inter-quartile range:
## the data has a lot of outliers, and the inter-quartile rule would remove
## around 236 records, shrinking the dataset dramatically.
# Absolute z-score of every value, computed column by column. vapply() insists
# on one numeric vector of nrow(df_train) values per column, so a malformed
# column fails loudly instead of silently changing the result's shape.
z_scores <- as.data.frame(
  vapply(
    df_train,
    function(column) abs(column - mean(column)) / sd(column),
    FUN.VALUE = numeric(nrow(df_train))
  )
)
# Keep only the rows in which no column lies more than 3 standard deviations
# from its column mean.
Final_train_data <- df_train[rowSums(z_scores > 3) == 0, ]
dim(Final_train_data)
## [1] 399 31
# Removed 56 rows containing outliers using the Z-score method (455 -> 399 rows)
# Box plot of every column after the z-score filter, for comparison with the
# plot drawn before treatment. las must be one of 0, 1, 2 or 3 (see ?par);
# 3 draws all axis labels vertically.
boxplot(Final_train_data, las = 3, main = "Box plot after outlier treatment")
# Distribution of perimeter_worst, one panel per diagnosis value.
# diagnosis is stored as an integer (0/1); wrapping it in factor() makes the
# fill a discrete scale so each class gets its own colour. labs() keeps the
# legend title as "diagnosis" instead of "factor(diagnosis)".
ggplot(Final_train_data, aes(x = perimeter_worst, fill = factor(diagnosis))) +
  geom_histogram() +
  facet_wrap(~diagnosis) +
  labs(fill = "diagnosis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Pairwise correlations between all columns of the outlier-filtered training
# data (Pearson, which is also cor()'s default method).
df_correlationMatrix <- cor(x = Final_train_data, method = "pearson")
# Show the full correlation matrix.
print(x = df_correlationMatrix)
## radius_mean texture_mean perimeter_mean area_mean
## radius_mean 1.00000000 0.29856413 0.99817835 0.99173530
## texture_mean 0.29856413 1.00000000 0.30626837 0.30701145
## perimeter_mean 0.99817835 0.30626837 1.00000000 0.99015205
## area_mean 0.99173530 0.30701145 0.99015205 1.00000000
## smoothness_mean 0.13376026 0.01609664 0.16644041 0.13640053
## compactness_mean 0.54769702 0.26927860 0.59331539 0.53738059
## concavity_mean 0.71814375 0.32849031 0.75273868 0.72163561
## concave_points_mean 0.82573094 0.29966970 0.85043581 0.82870480
## symmetry_mean 0.15554889 0.13111279 0.18248339 0.16404398
## fractal_dimension_mean -0.37279201 -0.04865916 -0.33021776 -0.35327903
## radius_se 0.67725904 0.35237177 0.68549184 0.71377702
## texure_se -0.13701178 0.42803985 -0.13149297 -0.11063807
## perimeter_se 0.66755582 0.36335223 0.68297762 0.70036076
## area_se 0.80536420 0.34784770 0.81099324 0.84108241
## smoothness_se -0.33266277 0.06621983 -0.31716814 -0.28714960
## compactness_se 0.25885063 0.27828229 0.29990389 0.25752191
## concavity_se 0.32694032 0.25838760 0.36100574 0.32829398
## concave_points_se 0.47213735 0.22149589 0.49645088 0.46547252
## symmetry_se -0.20514422 0.05334888 -0.19389509 -0.17716663
## fractal_dimension_se 0.00128676 0.17422342 0.03803676 0.01729457
## radius_worst 0.97376574 0.32550845 0.97455488 0.97406425
## texture_worst 0.28330870 0.90202913 0.29103924 0.28856895
## perimeter_worst 0.96834318 0.33651958 0.97402953 0.96746873
## area_worst 0.95230774 0.32819465 0.95297343 0.96870871
## smoothness_worst 0.10743610 0.09365363 0.13651894 0.11563836
## compactness_worst 0.46727031 0.27447500 0.50668065 0.44898117
## concavity_worst 0.58706785 0.29574520 0.62067646 0.57623805
## concave_points_worst 0.76855143 0.27514011 0.79205244 0.75561850
## symmetry_worst 0.20385254 0.12281898 0.22299758 0.19866426
## fractal_dimension_worst 0.05627572 0.14152280 0.09667917 0.05555742
## diagnosis 0.74832674 0.41974758 0.76302074 0.75127424
## smoothness_mean compactness_mean concavity_mean
## radius_mean 0.13376026 0.54769702 0.718143754
## texture_mean 0.01609664 0.26927860 0.328490307
## perimeter_mean 0.16644041 0.59331539 0.752738684
## area_mean 0.13640053 0.53738059 0.721635613
## smoothness_mean 1.00000000 0.61654103 0.479016759
## compactness_mean 0.61654103 1.00000000 0.887872826
## concavity_mean 0.47901676 0.88787283 1.000000000
## concave_points_mean 0.51812644 0.82729890 0.930057825
## symmetry_mean 0.50148833 0.52226317 0.439731159
## fractal_dimension_mean 0.56621360 0.44377231 0.186401719
## radius_se 0.26259442 0.49384227 0.626276910
## texure_se 0.10664792 -0.01797803 0.004139446
## perimeter_se 0.26027918 0.56123077 0.670709175
## area_se 0.22522831 0.52184522 0.685899256
## smoothness_se 0.33964515 0.03648579 -0.013929824
## compactness_se 0.24985970 0.73965861 0.651563854
## concavity_se 0.22795699 0.67372541 0.730272740
## concave_points_se 0.38192674 0.64872500 0.677019705
## symmetry_se 0.10512381 0.03089304 -0.012267449
## fractal_dimension_se 0.29305441 0.53867407 0.419301686
## radius_worst 0.18101717 0.58130638 0.738278198
## texture_worst 0.07238727 0.27443635 0.328585638
## perimeter_worst 0.20136965 0.63378538 0.776144268
## area_worst 0.17818463 0.55890080 0.727269163
## smoothness_worst 0.79921862 0.54674159 0.446708473
## compactness_worst 0.39395246 0.86935871 0.774610519
## concavity_worst 0.37140357 0.82984199 0.894080298
## concave_points_worst 0.45500368 0.81543006 0.872795667
## symmetry_worst 0.34213190 0.45516597 0.379008749
## fractal_dimension_worst 0.44875880 0.66738446 0.492555443
## diagnosis 0.34236224 0.63724181 0.762824154
## concave_points_mean symmetry_mean
## radius_mean 0.82573094 0.1555489
## texture_mean 0.29966970 0.1311128
## perimeter_mean 0.85043581 0.1824834
## area_mean 0.82870480 0.1640440
## smoothness_mean 0.51812644 0.5014883
## compactness_mean 0.82729890 0.5222632
## concavity_mean 0.93005782 0.4397312
## concave_points_mean 1.00000000 0.4212773
## symmetry_mean 0.42127731 1.0000000
## fractal_dimension_mean 0.06054732 0.3495092
## radius_se 0.70507350 0.3178219
## texure_se -0.01165410 0.1303349
## perimeter_se 0.72606721 0.3256623
## area_se 0.77510997 0.2743277
## smoothness_se -0.05177695 0.1831325
## compactness_se 0.48727029 0.3432753
## concavity_se 0.52849556 0.3013606
## concave_points_se 0.67831316 0.3232982
## symmetry_se -0.03456125 0.3076929
## fractal_dimension_se 0.27389979 0.2810543
## radius_worst 0.83834671 0.2055676
## texture_worst 0.30117295 0.1560837
## perimeter_worst 0.85985741 0.2282504
## area_worst 0.82570144 0.2052586
## smoothness_worst 0.43401875 0.4004411
## compactness_worst 0.66240146 0.4079059
## concavity_worst 0.75858220 0.3744382
## concave_points_worst 0.90900531 0.3785374
## symmetry_worst 0.36001092 0.6697614
## fractal_dimension_worst 0.34934609 0.3675820
## diagnosis 0.80525149 0.3475161
## fractal_dimension_mean radius_se texure_se
## radius_mean -0.37279201 0.67725904 -0.137011775
## texture_mean -0.04865916 0.35237177 0.428039853
## perimeter_mean -0.33021776 0.68549184 -0.131492968
## area_mean -0.35327903 0.71377702 -0.110638070
## smoothness_mean 0.56621360 0.26259442 0.106647921
## compactness_mean 0.44377231 0.49384227 -0.017978031
## concavity_mean 0.18640172 0.62627691 0.004139446
## concave_points_mean 0.06054732 0.70507350 -0.011654100
## symmetry_mean 0.34950922 0.31782187 0.130334923
## fractal_dimension_mean 1.00000000 -0.10122740 0.127282395
## radius_se -0.10122740 1.00000000 0.240504653
## texure_se 0.12728239 0.24050465 1.000000000
## perimeter_se -0.05044046 0.96404109 0.255426741
## area_se -0.18140107 0.96328091 0.124427757
## smoothness_se 0.42599486 0.12530663 0.467731449
## compactness_se 0.42657153 0.37069401 0.186754131
## concavity_se 0.29251085 0.39367806 0.136010860
## concave_points_se 0.15762242 0.57187840 0.223141738
## symmetry_se 0.23283881 0.14974169 0.411281718
## fractal_dimension_se 0.64524127 0.24219863 0.271576115
## radius_worst -0.31200858 0.72481796 -0.139816340
## texture_worst -0.02616985 0.27763734 0.485285882
## perimeter_worst -0.27021279 0.71834787 -0.137716374
## area_worst -0.29233858 0.74809351 -0.114746735
## smoothness_worst 0.50468702 0.13938535 -0.045475869
## compactness_worst 0.34815821 0.30298133 -0.154398023
## concavity_worst 0.20832354 0.39850021 -0.144569668
## concave_points_worst 0.06444937 0.53477396 -0.151641575
## symmetry_worst 0.23301162 0.10630343 -0.158795232
## fractal_dimension_worst 0.71506685 0.05077695 -0.105501248
## diagnosis -0.06009576 0.63629440 -0.013588861
## perimeter_se area_se smoothness_se compactness_se
## radius_mean 0.66755582 0.80536420 -0.332662766 0.2588506
## texture_mean 0.36335223 0.34784770 0.066219825 0.2782823
## perimeter_mean 0.68297762 0.81099324 -0.317168143 0.2999039
## area_mean 0.70036076 0.84108241 -0.287149602 0.2575219
## smoothness_mean 0.26027918 0.22522831 0.339645152 0.2498597
## compactness_mean 0.56123077 0.52184522 0.036485793 0.7396586
## concavity_mean 0.67070918 0.68589926 -0.013929824 0.6515639
## concave_points_mean 0.72606721 0.77510997 -0.051776951 0.4872703
## symmetry_mean 0.32566232 0.27432766 0.183132548 0.3432753
## fractal_dimension_mean -0.05044046 -0.18140107 0.425994860 0.4265715
## radius_se 0.96404109 0.96328091 0.125306627 0.3706940
## texure_se 0.25542674 0.12442776 0.467731449 0.1867541
## perimeter_se 1.00000000 0.93095668 0.123076681 0.4841160
## area_se 0.93095668 1.00000000 -0.010214549 0.3485652
## smoothness_se 0.12307668 -0.01021455 1.000000000 0.1807416
## compactness_se 0.48411598 0.34856524 0.180741647 1.0000000
## concavity_se 0.48623021 0.39396851 0.129129693 0.8590665
## concave_points_se 0.64396494 0.55222101 0.258892205 0.6937252
## symmetry_se 0.18104921 0.04037709 0.475209628 0.2506622
## fractal_dimension_se 0.31271532 0.18288440 0.381157843 0.7927604
## radius_worst 0.70434288 0.84381849 -0.313820509 0.2648261
## texture_worst 0.28913016 0.29165535 -0.009665971 0.2227798
## perimeter_worst 0.72276195 0.83670895 -0.308243459 0.3241378
## area_worst 0.72310974 0.86982040 -0.270776390 0.2549233
## smoothness_worst 0.12803833 0.14008533 0.361912755 0.1992618
## compactness_worst 0.37989419 0.35887344 -0.132848737 0.7258262
## concavity_worst 0.45832552 0.47380259 -0.150700803 0.6522075
## concave_points_worst 0.57062456 0.62628689 -0.160635476 0.4979262
## symmetry_worst 0.11918570 0.13844031 -0.133034404 0.2146998
## fractal_dimension_worst 0.10134337 0.05955569 0.073112787 0.5817629
## diagnosis 0.63145566 0.70031135 -0.096387398 0.3329372
## concavity_se concave_points_se symmetry_se
## radius_mean 0.3269403 0.47213735 -0.20514422
## texture_mean 0.2583876 0.22149589 0.05334888
## perimeter_mean 0.3610057 0.49645088 -0.19389509
## area_mean 0.3282940 0.46547252 -0.17716663
## smoothness_mean 0.2279570 0.38192674 0.10512381
## compactness_mean 0.6737254 0.64872500 0.03089304
## concavity_mean 0.7302727 0.67701970 -0.01226745
## concave_points_mean 0.5284956 0.67831316 -0.03456125
## symmetry_mean 0.3013606 0.32329823 0.30769287
## fractal_dimension_mean 0.2925108 0.15762242 0.23283881
## radius_se 0.3936781 0.57187840 0.14974169
## texure_se 0.1360109 0.22314174 0.41128172
## perimeter_se 0.4862302 0.64396494 0.18104921
## area_se 0.3939685 0.55222101 0.04037709
## smoothness_se 0.1291297 0.25889221 0.47520963
## compactness_se 0.8590665 0.69372523 0.25066225
## concavity_se 1.0000000 0.73789343 0.17402301
## concave_points_se 0.7378934 1.00000000 0.22558549
## symmetry_se 0.1740230 0.22558549 1.00000000
## fractal_dimension_se 0.6372759 0.50323067 0.31501285
## radius_worst 0.3233943 0.44433584 -0.22123516
## texture_worst 0.2091099 0.14910835 -0.05210785
## perimeter_worst 0.3730850 0.47603255 -0.21020433
## area_worst 0.3160164 0.42758720 -0.19798475
## smoothness_worst 0.1904671 0.23363776 -0.09460989
## compactness_worst 0.6246958 0.47689613 -0.12766680
## concavity_worst 0.7400518 0.53897008 -0.16383553
## concave_points_worst 0.5389444 0.65965359 -0.18202915
## symmetry_worst 0.1698343 0.08453104 0.22092402
## fractal_dimension_worst 0.4468984 0.23880144 -0.06890816
## diagnosis 0.3759069 0.47234209 -0.09437891
## fractal_dimension_se radius_worst texture_worst
## radius_mean 0.00128676 0.97376574 0.283308705
## texture_mean 0.17422342 0.32550845 0.902029129
## perimeter_mean 0.03803676 0.97455488 0.291039235
## area_mean 0.01729457 0.97406425 0.288568951
## smoothness_mean 0.29305441 0.18101717 0.072387266
## compactness_mean 0.53867407 0.58130638 0.274436348
## concavity_mean 0.41930169 0.73827820 0.328585638
## concave_points_mean 0.27389979 0.83834671 0.301172947
## symmetry_mean 0.28105430 0.20556765 0.156083686
## fractal_dimension_mean 0.64524127 -0.31200858 -0.026169854
## radius_se 0.24219863 0.72481796 0.277637340
## texure_se 0.27157611 -0.13981634 0.485285882
## perimeter_se 0.31271532 0.70434288 0.289130158
## area_se 0.18288440 0.84381849 0.291655354
## smoothness_se 0.38115784 -0.31382051 -0.009665971
## compactness_se 0.79276036 0.26482614 0.222779785
## concavity_se 0.63727587 0.32339431 0.209109864
## concave_points_se 0.50323067 0.44433584 0.149108345
## symmetry_se 0.31501285 -0.22123516 -0.052107850
## fractal_dimension_se 1.00000000 0.01580814 0.108189333
## radius_worst 0.01580814 1.00000000 0.343374904
## texture_worst 0.10818933 0.34337490 1.000000000
## perimeter_worst 0.05806924 0.99316403 0.354852917
## area_worst 0.02730220 0.99021334 0.342605270
## smoothness_worst 0.21411449 0.20579053 0.229647855
## compactness_worst 0.46273053 0.52888749 0.339665011
## concavity_worst 0.38118907 0.63557524 0.353464226
## concave_points_worst 0.23775943 0.80653679 0.341987017
## symmetry_worst 0.08686780 0.29254533 0.260773895
## fractal_dimension_worst 0.66578174 0.14583905 0.227927621
## diagnosis 0.13135370 0.79610486 0.456522368
## perimeter_worst area_worst smoothness_worst
## radius_mean 0.96834318 0.9523077 0.10743610
## texture_mean 0.33651958 0.3281947 0.09365363
## perimeter_mean 0.97402953 0.9529734 0.13651894
## area_mean 0.96746873 0.9687087 0.11563836
## smoothness_mean 0.20136965 0.1781846 0.79921862
## compactness_mean 0.63378538 0.5589008 0.54674159
## concavity_mean 0.77614427 0.7272692 0.44670847
## concave_points_mean 0.85985741 0.8257014 0.43401875
## symmetry_mean 0.22825043 0.2052586 0.40044112
## fractal_dimension_mean -0.27021279 -0.2923386 0.50468702
## radius_se 0.71834787 0.7480935 0.13938535
## texure_se -0.13771637 -0.1147467 -0.04547587
## perimeter_se 0.72276195 0.7231097 0.12803833
## area_se 0.83670895 0.8698204 0.14008533
## smoothness_se -0.30824346 -0.2707764 0.36191275
## compactness_se 0.32413780 0.2549233 0.19926185
## concavity_se 0.37308505 0.3160164 0.19046709
## concave_points_se 0.47603255 0.4275872 0.23363776
## symmetry_se -0.21020433 -0.1979848 -0.09460989
## fractal_dimension_se 0.05806924 0.0273022 0.21411449
## radius_worst 0.99316403 0.9902133 0.20579053
## texture_worst 0.35485292 0.3426053 0.22964785
## perimeter_worst 1.00000000 0.9816986 0.22521379
## area_worst 0.98169860 1.0000000 0.20761800
## smoothness_worst 0.22521379 0.2076180 1.00000000
## compactness_worst 0.58626168 0.5016880 0.50787738
## concavity_worst 0.68208152 0.6138833 0.48824322
## concave_points_worst 0.83480860 0.7801895 0.51770861
## symmetry_worst 0.31385489 0.2776926 0.47626752
## fractal_dimension_worst 0.19373637 0.1409382 0.59349636
## diagnosis 0.80641786 0.7838651 0.40584392
## compactness_worst concavity_worst concave_points_worst
## radius_mean 0.4672703 0.5870679 0.76855143
## texture_mean 0.2744750 0.2957452 0.27514011
## perimeter_mean 0.5066806 0.6206765 0.79205244
## area_mean 0.4489812 0.5762380 0.75561850
## smoothness_mean 0.3939525 0.3714036 0.45500368
## compactness_mean 0.8693587 0.8298420 0.81543006
## concavity_mean 0.7746105 0.8940803 0.87279567
## concave_points_mean 0.6624015 0.7585822 0.90900531
## symmetry_mean 0.4079059 0.3744382 0.37853742
## fractal_dimension_mean 0.3481582 0.2083235 0.06444937
## radius_se 0.3029813 0.3985002 0.53477396
## texure_se -0.1543980 -0.1445697 -0.15164157
## perimeter_se 0.3798942 0.4583255 0.57062456
## area_se 0.3588734 0.4738026 0.62628689
## smoothness_se -0.1328487 -0.1507008 -0.16063548
## compactness_se 0.7258262 0.6522075 0.49792619
## concavity_se 0.6246958 0.7400518 0.53894441
## concave_points_se 0.4768961 0.5389701 0.65965359
## symmetry_se -0.1276668 -0.1638355 -0.18202915
## fractal_dimension_se 0.4627305 0.3811891 0.23775943
## radius_worst 0.5288875 0.6355752 0.80653679
## texture_worst 0.3396650 0.3534642 0.34198702
## perimeter_worst 0.5862617 0.6820815 0.83480860
## area_worst 0.5016880 0.6138833 0.78018948
## smoothness_worst 0.5078774 0.4882432 0.51770861
## compactness_worst 1.0000000 0.9034638 0.79541707
## concavity_worst 0.9034638 1.0000000 0.85826168
## concave_points_worst 0.7954171 0.8582617 1.00000000
## symmetry_worst 0.5630911 0.4896662 0.47616247
## fractal_dimension_worst 0.7891382 0.6529336 0.49065397
## diagnosis 0.5996528 0.6848074 0.79417733
## symmetry_worst fractal_dimension_worst diagnosis
## radius_mean 0.20385254 0.05627572 0.74832674
## texture_mean 0.12281898 0.14152280 0.41974758
## perimeter_mean 0.22299758 0.09667917 0.76302074
## area_mean 0.19866426 0.05555742 0.75127424
## smoothness_mean 0.34213190 0.44875880 0.34236224
## compactness_mean 0.45516597 0.66738446 0.63724181
## concavity_mean 0.37900875 0.49255544 0.76282415
## concave_points_mean 0.36001092 0.34934609 0.80525149
## symmetry_mean 0.66976137 0.36758201 0.34751614
## fractal_dimension_mean 0.23301162 0.71506685 -0.06009576
## radius_se 0.10630343 0.05077695 0.63629440
## texure_se -0.15879523 -0.10550125 -0.01358886
## perimeter_se 0.11918570 0.10134337 0.63145566
## area_se 0.13844031 0.05955569 0.70031135
## smoothness_se -0.13303440 0.07311279 -0.09638740
## compactness_se 0.21469980 0.58176286 0.33293716
## concavity_se 0.16983426 0.44689842 0.37590686
## concave_points_se 0.08453104 0.23880144 0.47234209
## symmetry_se 0.22092402 -0.06890816 -0.09437891
## fractal_dimension_se 0.08686780 0.66578174 0.13135370
## radius_worst 0.29254533 0.14583905 0.79610486
## texture_worst 0.26077389 0.22792762 0.45652237
## perimeter_worst 0.31385489 0.19373637 0.80641786
## area_worst 0.27769261 0.14093816 0.78386515
## smoothness_worst 0.47626752 0.59349636 0.40584392
## compactness_worst 0.56309108 0.78913824 0.59965280
## concavity_worst 0.48966619 0.65293362 0.68480742
## concave_points_worst 0.47616247 0.49065397 0.79417733
## symmetry_worst 1.00000000 0.50360062 0.41106337
## fractal_dimension_worst 0.50360062 1.00000000 0.32916951
## diagnosis 0.41106337 0.32916951 1.00000000
# Visualise the correlation matrix: upper triangle only, variables ordered by
# hierarchical clustering, coloured with an 8-step "RdYlBu" palette.
corrplot(
  df_correlationMatrix,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu"),
  tl.cex = 0.5
)
# From the output we can see that the columns perimeter_worst, concave_points_mean, concave_points_worst, area_mean, radius_worst and area_worst are highly correlated with the diagnosis column (r = 0.75 to 0.81); texture_mean is only moderately correlated with it (r = 0.42)
# Flag attributes that are highly correlated with one another, using a
# pairwise correlation cutoff of 0.8; verbose = TRUE prints every comparison
# that leads to a flagged column.
df_highlyCorrelated <- findCorrelation(
  df_correlationMatrix,
  cutoff = 0.8,
  verbose = TRUE
)
## Compare row 7 and column 28 with corr 0.873
## Means: 0.571 vs 0.407 so flagging column 7
## Compare row 28 and column 8 with corr 0.909
## Means: 0.554 vs 0.396 so flagging column 28
## Compare row 8 and column 6 with corr 0.827
## Means: 0.539 vs 0.386 so flagging column 8
## Compare row 6 and column 27 with corr 0.83
## Means: 0.513 vs 0.375 so flagging column 6
## Compare row 23 and column 21 with corr 0.993
## Means: 0.51 vs 0.364 so flagging column 23
## Compare row 27 and column 26 with corr 0.903
## Means: 0.478 vs 0.354 so flagging column 27
## Compare row 21 and column 24 with corr 0.99
## Means: 0.469 vs 0.343 so flagging column 21
## Compare row 24 and column 3 with corr 0.953
## Means: 0.438 vs 0.333 so flagging column 24
## Compare row 3 and column 1 with corr 0.998
## Means: 0.409 vs 0.325 so flagging column 3
## Compare row 1 and column 4 with corr 0.992
## Means: 0.367 vs 0.318 so flagging column 1
## Compare row 4 and column 14 with corr 0.841
## Means: 0.336 vs 0.314 so flagging column 4
## Compare row 14 and column 13 with corr 0.931
## Means: 0.33 vs 0.311 so flagging column 14
## Compare row 13 and column 11 with corr 0.964
## Means: 0.339 vs 0.312 so flagging column 13
## Compare row 16 and column 17 with corr 0.859
## Means: 0.406 vs 0.306 so flagging column 16
## Compare row 22 and column 2 with corr 0.902
## Means: 0.248 vs 0.301 so flagging column 2
## All correlations <= 0.8
# Print the column indexes (positions in df_correlationMatrix) that
# findCorrelation() flagged as highly correlated
print(df_highlyCorrelated)
## [1] 7 28 8 6 23 27 21 24 3 1 4 14 13 16 2
# Histogram of perimeter_worst, one panel per diagnosis value
ggplot(Final_train_data, aes(x = perimeter_worst, fill = diagnosis)) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of concave_points_mean, one panel per diagnosis value
ggplot(Final_train_data, aes(x = concave_points_mean, fill = diagnosis)) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of concave_points_worst, one panel per diagnosis value
ggplot(Final_train_data, aes(x = concave_points_worst, fill = diagnosis)) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of texture_mean, one panel per diagnosis value
ggplot(Final_train_data, aes(x = texture_mean, fill = diagnosis)) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of area_mean, one panel per diagnosis value
ggplot(Final_train_data, aes(x = area_mean, fill = diagnosis)) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of radius_worst, one panel per diagnosis value
ggplot(Final_train_data, aes(x = radius_worst, fill = diagnosis)) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Convert diagnosis to a factor in both the training and the test set so that
# it is handled as a class label.
Final_train_data$diagnosis <- as.factor(Final_train_data$diagnosis)
df_test$diagnosis <- as.factor(df_test$diagnosis)

# Initial classification tree: every other column of Final_train_data is a
# candidate predictor and splits are chosen by information gain.
DT <- rpart(
  diagnosis ~ .,
  data = Final_train_data,
  method = "class",
  parms = list(split = "information")
)
summary(DT)
## Call:
## rpart(formula = diagnosis ~ ., data = Final_train_data, method = "class",
## parms = list(split = "information"))
## n= 399
##
## CP nsplit rel error xerror xstd
## 1 0.77272727 0 1.0000000 1.0000000 0.07120036
## 2 0.04924242 1 0.2272727 0.3636364 0.04922818
## 3 0.01515152 3 0.1287879 0.2651515 0.04280794
## 4 0.01000000 4 0.1136364 0.2651515 0.04280794
##
## Variable importance
## perimeter_worst area_worst radius_worst
## 16 14 14
## perimeter_mean area_mean radius_mean
## 13 12 12
## concave_points_worst concavity_mean concave_points_mean
## 4 3 2
## compactness_mean texture_worst texture_mean
## 2 2 1
## perimeter_se compactness_worst concavity_worst
## 1 1 1
## smoothness_worst
## 1
##
## Node number 1: 399 observations, complexity param=0.7727273
## predicted class=0 expected loss=0.3308271 P(node) =1
## class counts: 267 132
## probabilities: 0.669 0.331
## left son=2 (257 obs) right son=3 (142 obs)
## Primary splits:
## perimeter_worst < 107.2 to the left, improve=153.2803, (0 missing)
## concave_points_mean < 0.04892 to the left, improve=151.1599, (0 missing)
## radius_worst < 16.805 to the left, improve=146.7947, (0 missing)
## concave_points_worst < 0.14555 to the left, improve=145.5684, (0 missing)
## area_worst < 888.85 to the left, improve=145.0168, (0 missing)
## Surrogate splits:
## radius_worst < 16.205 to the left, agree=0.972, adj=0.923, (0 split)
## area_worst < 784.15 to the left, agree=0.972, adj=0.923, (0 split)
## perimeter_mean < 92.42 to the left, agree=0.940, adj=0.831, (0 split)
## area_mean < 632.8 to the left, agree=0.935, adj=0.817, (0 split)
## radius_mean < 14.15 to the left, agree=0.932, adj=0.810, (0 split)
##
## Node number 2: 257 observations, complexity param=0.01515152
## predicted class=0 expected loss=0.03891051 P(node) =0.6441103
## class counts: 247 10
## probabilities: 0.961 0.039
## left son=4 (243 obs) right son=5 (14 obs)
## Primary splits:
## concave_points_worst < 0.1352 to the left, improve=21.11549, (0 missing)
## smoothness_worst < 0.17725 to the left, improve=17.27677, (0 missing)
## concavity_mean < 0.09752 to the left, improve=16.77982, (0 missing)
## concave_points_mean < 0.05583 to the left, improve=16.39897, (0 missing)
## concavity_worst < 0.3967 to the left, improve=16.39897, (0 missing)
## Surrogate splits:
## compactness_mean < 0.1338 to the left, agree=0.969, adj=0.429, (0 split)
## concavity_mean < 0.11265 to the left, agree=0.969, adj=0.429, (0 split)
## smoothness_worst < 0.17725 to the left, agree=0.969, adj=0.429, (0 split)
## compactness_worst < 0.3932 to the left, agree=0.969, adj=0.429, (0 split)
## concavity_worst < 0.3967 to the left, agree=0.969, adj=0.429, (0 split)
##
## Node number 3: 142 observations, complexity param=0.04924242
## predicted class=1 expected loss=0.1408451 P(node) =0.3558897
## class counts: 20 122
## probabilities: 0.141 0.859
## left son=6 (49 obs) right son=7 (93 obs)
## Primary splits:
## concave_points_mean < 0.063655 to the left, improve=24.58926, (0 missing)
## texture_mean < 16.795 to the left, improve=24.33156, (0 missing)
## texture_worst < 21.745 to the left, improve=24.33156, (0 missing)
## perimeter_worst < 116.05 to the left, improve=21.55216, (0 missing)
## concave_points_worst < 0.14905 to the left, improve=21.50773, (0 missing)
## Surrogate splits:
## concavity_mean < 0.10715 to the left, agree=0.894, adj=0.694, (0 split)
## concave_points_worst < 0.1463 to the left, agree=0.894, adj=0.694, (0 split)
## perimeter_worst < 118.25 to the left, agree=0.831, adj=0.510, (0 split)
## compactness_mean < 0.1027 to the left, agree=0.810, adj=0.449, (0 split)
## perimeter_se < 2.87 to the left, agree=0.803, adj=0.429, (0 split)
##
## Node number 4: 243 observations
## predicted class=0 expected loss=0.008230453 P(node) =0.6090226
## class counts: 241 2
## probabilities: 0.992 0.008
##
## Node number 5: 14 observations
## predicted class=1 expected loss=0.4285714 P(node) =0.03508772
## class counts: 6 8
## probabilities: 0.429 0.571
##
## Node number 6: 49 observations, complexity param=0.04924242
## predicted class=1 expected loss=0.4081633 P(node) =0.122807
## class counts: 20 29
## probabilities: 0.408 0.592
## left son=12 (13 obs) right son=13 (36 obs)
## Primary splits:
## texture_worst < 20.045 to the left, improve=15.399240, (0 missing)
## texture_mean < 15.745 to the left, improve=13.816120, (0 missing)
## area_worst < 957.45 to the left, improve= 7.383271, (0 missing)
## perimeter_worst < 128.05 to the left, improve= 6.846033, (0 missing)
## symmetry_worst < 0.31965 to the left, improve= 6.846033, (0 missing)
## Surrogate splits:
## texture_mean < 15.745 to the left, agree=0.980, adj=0.923, (0 split)
## radius_se < 0.2474 to the left, agree=0.796, adj=0.231, (0 split)
## texure_se < 0.47315 to the left, agree=0.796, adj=0.231, (0 split)
## area_se < 22.47 to the left, agree=0.796, adj=0.231, (0 split)
## compactness_mean < 0.1437 to the right, agree=0.776, adj=0.154, (0 split)
##
## Node number 7: 93 observations
## predicted class=1 expected loss=0 P(node) =0.2330827
## class counts: 0 93
## probabilities: 0.000 1.000
##
## Node number 12: 13 observations
## predicted class=0 expected loss=0 P(node) =0.03258145
## class counts: 13 0
## probabilities: 1.000 0.000
##
## Node number 13: 36 observations
## predicted class=1 expected loss=0.1944444 P(node) =0.09022556
## class counts: 7 29
## probabilities: 0.194 0.806
# Draw the fitted tree DT, then plot its complexity-parameter (cp) table
rpart.plot(x = DT, main = "Decision Tree for medical diagnoses")
plotcp(x = DT)
# Insights : There are 5 leaf nodes in this decision tree
– Major predictors suggested by the tree are perimeter_worst, concave_points_mean, concave_points_worst and texture_worst. These are the major predictors because they give the maximum information gain at each split
– Yes, the predictors chosen by the decision tree are largely the same as the predictors we identified from the correlation matrix
– If perimeter_worst is at least 107.2 and concave_points_mean is at least 0.064, then the person is likely to have cancerous tissue and the probability in this case is 100% (node 7: 93 of 93 training cases)
– If perimeter_worst is at least 107.2, concave_points_mean is less than 0.064 and texture_worst is at least 20.0, then the person has cancerous tissue with a probability of 81% (node 13: 29 of 36 training cases)
# Classify the training rows with DT and cross-tabulate the result:
# actual diagnosis in rows, predicted class in columns
predict_train <- predict(DT, newdata = Final_train_data, type = "class")
table_train <- table(Final_train_data$diagnosis, predict_train)
table_train
## predict_train
## 0 1
## 0 254 13
## 1 2 130
# Classify the held-out test rows with DT and cross-tabulate the result:
# actual diagnosis in rows, predicted class in columns
predict_test <- predict(DT, newdata = df_test, type = "class")
table_test <- table(df_test$diagnosis, predict_test)
table_test
## predict_test
## 0 1
## 0 28 4
## 1 2 23
# Training accuracy of DT: correctly classified cases (table diagonal)
# divided by all cases
accuracy_Train <- sum(diag(table_train)) / sum(table_train)
print(paste("Accuracy for train", accuracy_Train))
## [1] "Accuracy for train 0.962406015037594"
# Findings: accuracy on the training data is 96.2%
# Test accuracy of DT, computed the same way from the test confusion table
accuracy_Test <- sum(diag(table_test)) / sum(table_test)
print(paste("Accuracy for test", accuracy_Test))
## [1] "Accuracy for test 0.894736842105263"
# Findings: accuracy on the test data is 89.5%
# Build a second decision tree to try to improve the accuracy. Same formula and
# information-gain splitting as DT, but with explicit control settings:
# minsplit = 10, minbucket = 5, cp = 0.01.
DT1 <- rpart(
  diagnosis ~ .,
  data = Final_train_data,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(minsplit = 10, minbucket = 5, cp = 0.01)
)
# Node-by-node summary of the new tree
summary(DT1)
## Call:
## rpart(formula = diagnosis ~ ., data = Final_train_data, method = "class",
## parms = list(split = "information"), control = rpart.control(minsplit = 10,
## minbucket = 5, cp = 0.01))
## n= 399
##
## CP nsplit rel error xerror xstd
## 1 0.77272727 0 1.00000000 1.0000000 0.07120036
## 2 0.04924242 1 0.22727273 0.3787879 0.05009996
## 3 0.02651515 3 0.12878788 0.2272727 0.03990372
## 4 0.01893939 5 0.07575758 0.1893939 0.03667291
## 5 0.01000000 7 0.03787879 0.1893939 0.03667291
##
## Variable importance
## perimeter_worst radius_worst area_worst
## 16 14 13
## perimeter_mean area_mean radius_mean
## 12 12 12
## concave_points_worst concave_points_mean concavity_mean
## 4 3 2
## compactness_mean texture_worst texture_mean
## 2 2 2
## smoothness_worst perimeter_se compactness_worst
## 1 1 1
## concavity_worst texure_se
## 1 1
##
## Node number 1: 399 observations, complexity param=0.7727273
## predicted class=0 expected loss=0.3308271 P(node) =1
## class counts: 267 132
## probabilities: 0.669 0.331
## left son=2 (257 obs) right son=3 (142 obs)
## Primary splits:
## perimeter_worst < 107.2 to the left, improve=153.2803, (0 missing)
## concave_points_mean < 0.04892 to the left, improve=151.1599, (0 missing)
## radius_worst < 16.805 to the left, improve=146.7947, (0 missing)
## concave_points_worst < 0.14555 to the left, improve=145.5684, (0 missing)
## area_worst < 888.85 to the left, improve=145.0168, (0 missing)
## Surrogate splits:
## radius_worst < 16.205 to the left, agree=0.972, adj=0.923, (0 split)
## area_worst < 784.15 to the left, agree=0.972, adj=0.923, (0 split)
## perimeter_mean < 92.42 to the left, agree=0.940, adj=0.831, (0 split)
## area_mean < 632.8 to the left, agree=0.935, adj=0.817, (0 split)
## radius_mean < 14.15 to the left, agree=0.932, adj=0.810, (0 split)
##
## Node number 2: 257 observations, complexity param=0.02651515
## predicted class=0 expected loss=0.03891051 P(node) =0.6441103
## class counts: 247 10
## probabilities: 0.961 0.039
## left son=4 (243 obs) right son=5 (14 obs)
## Primary splits:
## concave_points_worst < 0.1352 to the left, improve=21.11549, (0 missing)
## smoothness_worst < 0.17725 to the left, improve=17.27677, (0 missing)
## concavity_mean < 0.09752 to the left, improve=16.77982, (0 missing)
## concave_points_mean < 0.05583 to the left, improve=16.39897, (0 missing)
## concavity_worst < 0.3967 to the left, improve=16.39897, (0 missing)
## Surrogate splits:
## compactness_mean < 0.1338 to the left, agree=0.969, adj=0.429, (0 split)
## concavity_mean < 0.11265 to the left, agree=0.969, adj=0.429, (0 split)
## smoothness_worst < 0.17725 to the left, agree=0.969, adj=0.429, (0 split)
## compactness_worst < 0.3932 to the left, agree=0.969, adj=0.429, (0 split)
## concavity_worst < 0.3967 to the left, agree=0.969, adj=0.429, (0 split)
##
## Node number 3: 142 observations, complexity param=0.04924242
## predicted class=1 expected loss=0.1408451 P(node) =0.3558897
## class counts: 20 122
## probabilities: 0.141 0.859
## left son=6 (49 obs) right son=7 (93 obs)
## Primary splits:
## concave_points_mean < 0.063655 to the left, improve=24.58926, (0 missing)
## texture_mean < 16.795 to the left, improve=24.33156, (0 missing)
## texture_worst < 21.745 to the left, improve=24.33156, (0 missing)
## perimeter_worst < 116.05 to the left, improve=21.55216, (0 missing)
## concave_points_worst < 0.14905 to the left, improve=21.50773, (0 missing)
## Surrogate splits:
## concavity_mean < 0.10715 to the left, agree=0.894, adj=0.694, (0 split)
## concave_points_worst < 0.1463 to the left, agree=0.894, adj=0.694, (0 split)
## perimeter_worst < 118.25 to the left, agree=0.831, adj=0.510, (0 split)
## compactness_mean < 0.1027 to the left, agree=0.810, adj=0.449, (0 split)
## perimeter_se < 2.87 to the left, agree=0.803, adj=0.429, (0 split)
##
## Node number 4: 243 observations
## predicted class=0 expected loss=0.008230453 P(node) =0.6090226
## class counts: 241 2
## probabilities: 0.992 0.008
##
## Node number 5: 14 observations, complexity param=0.02651515
## predicted class=1 expected loss=0.4285714 P(node) =0.03508772
## class counts: 6 8
## probabilities: 0.429 0.571
## left son=10 (7 obs) right son=11 (7 obs)
## Primary splits:
## texture_worst < 26.9 to the left, improve=6.689899, (0 missing)
## texture_mean < 20.05 to the left, improve=3.832086, (0 missing)
## symmetry_mean < 0.2044 to the left, improve=3.832086, (0 missing)
## smoothness_worst < 0.17825 to the left, improve=3.832086, (0 missing)
## fractal_dimension_worst < 0.11785 to the left, improve=3.832086, (0 missing)
## Surrogate splits:
## texture_mean < 18.42 to the left, agree=0.857, adj=0.714, (0 split)
## symmetry_mean < 0.20965 to the left, agree=0.786, adj=0.571, (0 split)
## texure_se < 1.0685 to the left, agree=0.786, adj=0.571, (0 split)
## symmetry_worst < 0.2679 to the left, agree=0.786, adj=0.571, (0 split)
## concave_points_mean < 0.04271 to the left, agree=0.714, adj=0.429, (0 split)
##
## Node number 6: 49 observations, complexity param=0.04924242
## predicted class=1 expected loss=0.4081633 P(node) =0.122807
## class counts: 20 29
## probabilities: 0.408 0.592
## left son=12 (13 obs) right son=13 (36 obs)
## Primary splits:
## texture_worst < 20.045 to the left, improve=15.399240, (0 missing)
## texture_mean < 15.745 to the left, improve=13.816120, (0 missing)
## area_worst < 957.45 to the left, improve= 7.383271, (0 missing)
## perimeter_worst < 128.05 to the left, improve= 6.846033, (0 missing)
## symmetry_worst < 0.31965 to the left, improve= 6.846033, (0 missing)
## Surrogate splits:
## texture_mean < 15.745 to the left, agree=0.980, adj=0.923, (0 split)
## radius_se < 0.2474 to the left, agree=0.796, adj=0.231, (0 split)
## texure_se < 0.47315 to the left, agree=0.796, adj=0.231, (0 split)
## area_se < 22.47 to the left, agree=0.796, adj=0.231, (0 split)
## compactness_mean < 0.1437 to the right, agree=0.776, adj=0.154, (0 split)
##
## Node number 7: 93 observations
## predicted class=1 expected loss=0 P(node) =0.2330827
## class counts: 0 93
## probabilities: 0.000 1.000
##
## Node number 10: 7 observations
## predicted class=0 expected loss=0.1428571 P(node) =0.01754386
## class counts: 6 1
## probabilities: 0.857 0.143
##
## Node number 11: 7 observations
## predicted class=1 expected loss=0 P(node) =0.01754386
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 12: 13 observations
## predicted class=0 expected loss=0 P(node) =0.03258145
## class counts: 13 0
## probabilities: 1.000 0.000
##
## Node number 13: 36 observations, complexity param=0.01893939
## predicted class=1 expected loss=0.1944444 P(node) =0.09022556
## class counts: 7 29
## probabilities: 0.194 0.806
## left son=26 (12 obs) right son=27 (24 obs)
## Primary splits:
## radius_worst < 16.8 to the left, improve=5.259041, (0 missing)
## concave_points_se < 0.0099805 to the right, improve=4.784799, (0 missing)
## area_worst < 871.8 to the left, improve=4.647885, (0 missing)
## smoothness_worst < 0.13645 to the left, improve=3.972922, (0 missing)
## concave_points_mean < 0.048785 to the left, improve=3.618231, (0 missing)
## Surrogate splits:
## area_worst < 871.8 to the left, agree=0.972, adj=0.917, (0 split)
## radius_mean < 15.045 to the left, agree=0.889, adj=0.667, (0 split)
## area_mean < 697.8 to the left, agree=0.889, adj=0.667, (0 split)
## perimeter_worst < 111.7 to the left, agree=0.861, adj=0.583, (0 split)
## perimeter_mean < 94.485 to the left, agree=0.806, adj=0.417, (0 split)
##
## Node number 26: 12 observations, complexity param=0.01893939
## predicted class=0 expected loss=0.5 P(node) =0.03007519
## class counts: 6 6
## probabilities: 0.500 0.500
## left son=52 (5 obs) right son=53 (7 obs)
## Primary splits:
## concave_points_mean < 0.048785 to the left, improve=5.446952, (0 missing)
## smoothness_worst < 0.13755 to the left, improve=5.446952, (0 missing)
## smoothness_mean < 0.097515 to the left, improve=2.911032, (0 missing)
## smoothness_se < 0.0053495 to the left, improve=2.911032, (0 missing)
## radius_mean < 14.335 to the right, improve=1.627867, (0 missing)
## Surrogate splits:
## smoothness_mean < 0.09218 to the left, agree=0.917, adj=0.8, (0 split)
## compactness_se < 0.03024 to the right, agree=0.833, adj=0.6, (0 split)
## smoothness_worst < 0.13755 to the left, agree=0.833, adj=0.6, (0 split)
## texture_mean < 21.915 to the right, agree=0.750, adj=0.4, (0 split)
## perimeter_mean < 95.145 to the right, agree=0.750, adj=0.4, (0 split)
##
## Node number 27: 24 observations
## predicted class=1 expected loss=0.04166667 P(node) =0.06015038
## class counts: 1 23
## probabilities: 0.042 0.958
##
## Node number 52: 5 observations
## predicted class=0 expected loss=0 P(node) =0.01253133
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 53: 7 observations
## predicted class=1 expected loss=0.1428571 P(node) =0.01754386
## class counts: 1 6
## probabilities: 0.143 0.857
# Confusion table of DT1 on the training data (predicted class in rows,
# actual class in columns)
DT1_train <- table(
  pred = predict(DT1, Final_train_data, type = "class"),
  true = Final_train_data$diagnosis
)
# Confusion table of DT1 on the test data, same layout
DT1_test <- table(
  pred = predict(DT1, df_test, type = "class"),
  true = df_test$diagnosis
)
# Training accuracy: diagonal (correct predictions) over the table total
accuracy_Train_DT1 <- sum(diag(DT1_train)) / sum(DT1_train)
accuracy_Train_DT1
## [1] 0.9874687
# Insights : Accuracy of train data is 98.75%
# Test accuracy, computed the same way
accuracy_Test_DT1 <- sum(diag(DT1_test)) / sum(DT1_test)
accuracy_Test_DT1
## [1] 0.9473684
# Insights : Accuracy of test data is 94.74%
# Confusion matrix and summary statistics for DT1 on the training data.
# DT1_train is already a cross-tabulation (pred x true), so it is passed on its
# own: a `reference` vector is only meaningful when raw predictions are given,
# and was silently ignored here. No `positive` class is set, so the reported
# sensitivity/specificity treat class "0" as the positive class.
confusionMatrix(DT1_train)
## Confusion Matrix and Statistics
##
## true
## pred 0 1
## 0 265 3
## 1 2 129
##
## Accuracy : 0.9875
## 95% CI : (0.971, 0.9959)
## No Information Rate : 0.6692
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9716
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9925
## Specificity : 0.9773
## Pos Pred Value : 0.9888
## Neg Pred Value : 0.9847
## Prevalence : 0.6692
## Detection Rate : 0.6642
## Detection Prevalence : 0.6717
## Balanced Accuracy : 0.9849
##
## 'Positive' Class : 0
##
#Insights : 1. Accuracy : 98.75%
# 2.Sensitivity : 0.9925 (positive class is 0)
# 3.Specificity : 0.9773
# ROC curve for new model DT1: predict class probabilities on the test set
# and keep the second column as the score for each test row
prob_test_DT1 <- predict(DT1, newdata = df_test, type = "prob")
scoreTst <- prob_test_DT1[, 2]
scoreTst
## 1 2 3 4 5 6
## 0.008230453 0.008230453 0.008230453 1.000000000 1.000000000 1.000000000
## 7 8 9 10 11 12
## 0.008230453 0.958333333 0.008230453 0.008230453 0.000000000 1.000000000
## 13 14 15 16 17 18
## 0.008230453 0.008230453 1.000000000 0.000000000 0.008230453 0.008230453
## 19 20 21 22 23 24
## 0.958333333 0.142857143 1.000000000 0.008230453 0.008230453 0.008230453
## 25 26 27 28 29 30
## 0.008230453 0.008230453 0.000000000 0.008230453 1.000000000 1.000000000
## 31 32 33 34 35 36
## 0.008230453 0.008230453 0.008230453 1.000000000 0.958333333 1.000000000
## 37 38 39 40 41 42
## 0.008230453 0.008230453 0.008230453 0.008230453 1.000000000 1.000000000
## 43 44 45 46 47 48
## 1.000000000 1.000000000 0.008230453 0.008230453 1.000000000 1.000000000
## 49 50 51 52 53 54
## 1.000000000 1.000000000 0.008230453 0.008230453 0.008230453 1.000000000
## 55 56 57
## 0.958333333 1.000000000 0.008230453
# Build a ROCR prediction object for DT1 from the test-set scores.
# label.ordering lists the negative class ("0") first and the positive
# class ("1") second. prediction() is namespace-qualified, matching the
# ROCR::performance() calls below.
rocPredTst <- ROCR::prediction(
  scoreTst,
  df_test$diagnosis,
  label.ordering = c("0", "1")
)
# True-positive rate against false-positive rate, i.e. the ROC curve of DT1
perfROCTst <- ROCR::performance(rocPredTst, "tpr", "fpr")

# ROC curve for the initial model DT, built the same way
scoreTst_DT <- predict(DT, df_test, type = "prob")[, 2]
rocPredTst_DT <- ROCR::prediction(
  scoreTst_DT,
  df_test$diagnosis,
  label.ordering = c("0", "1")
)
perfROCTst_DT <- ROCR::performance(rocPredTst_DT, "tpr", "fpr")

# Overlay both curves: DT1 in the default colour, DT in blue
plot(perfROCTst)
plot(perfROCTst_DT, add = TRUE, col = "blue")

# AUC value for Final model DT1
aucPerf_final <- ROCR::performance(rocPredTst, "auc")
aucPerf_final@y.values
## [[1]]
## [1] 0.944375
#Findings : AUC value for DT1 is 0.944
# AUC value for initial model DT, taken from the ROCR prediction object
# rocPredTst_DT; the value is stored in the y.values slot
aucPerf_initial <- ROCR::performance(rocPredTst_DT, "auc")
aucPerf_initial@y.values
## [[1]]
## [1] 0.926875
# Findings : AUC value for DT is 0.9268
# Plot the final tree (DT1)
rpart.plot(DT1, main="Final Decision Tree for medical diagnoses")
# Settings used to build the final tree DT1:
# 1. diagnosis is the factor response variable
# 2. Splits are chosen by information gain (split = "information")
# 3. minsplit = 10: minimum number of observations in a node for a split to be attempted
# 4. minbucket = 5: minimum number of observations in any leaf
# 5. The tree is built with complexity parameter (cp) 0.01
# Load the phishing dataset. The file has no header row, and everything from an
# "@" character onwards is treated as a comment, so "@..." lines are skipped.
df_phi <- read.csv(
  "Training Dataset.arff",
  header = FALSE,
  comment.char = "@"
)
# Assign the 31 column names by position (30 features plus Result)
names(df_phi) <- c(
  "having_IP_Address", "URL_Length", "Shortining_Service",
  "having_At_Symbol", "double_slash_redirecting", "Prefix_Suffix",
  "having_Sub_Domain", "SSLfinal_State", "Domain_registeration_length",
  "Favicon", "port", "HTTPS_token", "Request_URL", "URL_of_Anchor",
  "Links_in_tags", "SFH", "Submitting_to_email", "Abnormal_URL",
  "Redirect", "on_mouseover", "RightClick", "popUpWidnow", "Iframe",
  "age_of_domain", "DNSRecord", "web_traffic", "Page_Rank",
  "Google_Index", "Links_pointing_to_page", "Statistical_report",
  "Result"
)
head(df_phi)
# If a column contains only -1 and 1, then -1 = phishing and 1 = legitimate
# If a column contains -1, 0 and 1, then -1 = legitimate, 0 = suspicious, 1 = phishing
# Structure of the data frame
str(df_phi)
## 'data.frame': 11055 obs. of 31 variables:
## $ having_IP_Address : int -1 1 1 1 1 -1 1 1 1 1 ...
## $ URL_Length : int 1 1 0 0 0 0 0 0 0 1 ...
## $ Shortining_Service : int 1 1 1 1 -1 -1 -1 1 -1 -1 ...
## $ having_At_Symbol : int 1 1 1 1 1 1 1 1 1 1 ...
## $ double_slash_redirecting : int -1 1 1 1 1 -1 1 1 1 1 ...
## $ Prefix_Suffix : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ having_Sub_Domain : int -1 0 -1 -1 1 1 -1 -1 1 -1 ...
## $ SSLfinal_State : int -1 1 -1 -1 1 1 -1 -1 1 1 ...
## $ Domain_registeration_length: int -1 -1 -1 1 -1 -1 1 1 -1 -1 ...
## $ Favicon : int 1 1 1 1 1 1 1 1 1 1 ...
## $ port : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HTTPS_token : int -1 -1 -1 -1 1 -1 1 -1 -1 1 ...
## $ Request_URL : int 1 1 1 -1 1 1 -1 -1 1 1 ...
## $ URL_of_Anchor : int -1 0 0 0 0 0 -1 0 0 0 ...
## $ Links_in_tags : int 1 -1 -1 0 0 0 0 -1 1 1 ...
## $ SFH : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ Submitting_to_email : int -1 1 -1 1 1 -1 -1 1 1 1 ...
## $ Abnormal_URL : int -1 1 -1 1 1 -1 -1 1 1 1 ...
## $ Redirect : int 0 0 0 0 0 0 0 0 0 0 ...
## $ on_mouseover : int 1 1 1 1 -1 1 1 1 1 1 ...
## $ RightClick : int 1 1 1 1 1 1 1 1 1 1 ...
## $ popUpWidnow : int 1 1 1 1 -1 1 1 1 1 1 ...
## $ Iframe : int 1 1 1 1 1 1 1 1 1 1 ...
## $ age_of_domain : int -1 -1 1 -1 -1 1 1 -1 1 1 ...
## $ DNSRecord : int -1 -1 -1 -1 -1 1 -1 -1 -1 -1 ...
## $ web_traffic : int -1 0 1 1 0 1 -1 0 1 0 ...
## $ Page_Rank : int -1 -1 -1 -1 -1 -1 -1 -1 1 -1 ...
## $ Google_Index : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Links_pointing_to_page : int 1 1 0 -1 1 -1 0 0 0 0 ...
## $ Statistical_report : int -1 1 -1 1 1 -1 -1 1 1 1 ...
## $ Result : int -1 -1 -1 -1 1 1 -1 -1 1 -1 ...
#Findings : There are 31 fields with 11055 rows in train dataset
# Per-column summary statistics (min, quartiles, mean, max) of df_phi
summary(df_phi)
## having_IP_Address URL_Length Shortining_Service having_At_Symbol
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:-1.0000 1st Qu.:-1.0000 1st Qu.: 1.0000 1st Qu.: 1.0000
## Median : 1.0000 Median :-1.0000 Median : 1.0000 Median : 1.0000
## Mean : 0.3138 Mean :-0.6332 Mean : 0.7388 Mean : 0.7006
## 3rd Qu.: 1.0000 3rd Qu.:-1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. : 1.0000 Max. : 1.0000 Max. : 1.0000 Max. : 1.0000
## double_slash_redirecting Prefix_Suffix having_Sub_Domain SSLfinal_State
## Min. :-1.0000 Min. :-1.000 Min. :-1.00000 Min. :-1.0000
## 1st Qu.: 1.0000 1st Qu.:-1.000 1st Qu.:-1.00000 1st Qu.:-1.0000
## Median : 1.0000 Median :-1.000 Median : 0.00000 Median : 1.0000
## Mean : 0.7415 Mean :-0.735 Mean : 0.06395 Mean : 0.2509
## 3rd Qu.: 1.0000 3rd Qu.:-1.000 3rd Qu.: 1.00000 3rd Qu.: 1.0000
## Max. : 1.0000 Max. : 1.000 Max. : 1.00000 Max. : 1.0000
## Domain_registeration_length Favicon port
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:-1.0000 1st Qu.: 1.0000 1st Qu.: 1.0000
## Median :-1.0000 Median : 1.0000 Median : 1.0000
## Mean :-0.3368 Mean : 0.6286 Mean : 0.7283
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. : 1.0000 Max. : 1.0000 Max. : 1.0000
## HTTPS_token Request_URL URL_of_Anchor Links_in_tags
## Min. :-1.0000 Min. :-1.0000 Min. :-1.00000 Min. :-1.0000
## 1st Qu.: 1.0000 1st Qu.:-1.0000 1st Qu.:-1.00000 1st Qu.:-1.0000
## Median : 1.0000 Median : 1.0000 Median : 0.00000 Median : 0.0000
## Mean : 0.6751 Mean : 0.1868 Mean :-0.07653 Mean :-0.1181
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 0.00000 3rd Qu.: 0.0000
## Max. : 1.0000 Max. : 1.0000 Max. : 1.00000 Max. : 1.0000
## SFH Submitting_to_email Abnormal_URL Redirect
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000 Min. :0.0000
## 1st Qu.:-1.0000 1st Qu.: 1.0000 1st Qu.: 1.0000 1st Qu.:0.0000
## Median :-1.0000 Median : 1.0000 Median : 1.0000 Median :0.0000
## Mean :-0.5957 Mean : 0.6356 Mean : 0.7053 Mean :0.1157
## 3rd Qu.:-1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.:0.0000
## Max. : 1.0000 Max. : 1.0000 Max. : 1.0000 Max. :1.0000
## on_mouseover RightClick popUpWidnow Iframe
## Min. :-1.0000 Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.: 1.0000 1st Qu.: 1.0000 1st Qu.: 1.0000 1st Qu.: 1.0000
## Median : 1.0000 Median : 1.0000 Median : 1.0000 Median : 1.0000
## Mean : 0.7621 Mean : 0.9139 Mean : 0.6134 Mean : 0.8169
## 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. : 1.0000 Max. : 1.0000 Max. : 1.0000 Max. : 1.0000
## age_of_domain DNSRecord web_traffic Page_Rank
## Min. :-1.00000 Min. :-1.0000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.:-1.00000 1st Qu.:-1.0000 1st Qu.: 0.0000 1st Qu.:-1.0000
## Median : 1.00000 Median : 1.0000 Median : 1.0000 Median :-1.0000
## Mean : 0.06124 Mean : 0.3771 Mean : 0.2873 Mean :-0.4837
## 3rd Qu.: 1.00000 3rd Qu.: 1.0000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. : 1.00000 Max. : 1.0000 Max. : 1.0000 Max. : 1.0000
## Google_Index Links_pointing_to_page Statistical_report Result
## Min. :-1.0000 Min. :-1.000 Min. :-1.0000 Min. :-1.0000
## 1st Qu.: 1.0000 1st Qu.: 0.000 1st Qu.: 1.0000 1st Qu.:-1.0000
## Median : 1.0000 Median : 0.000 Median : 1.0000 Median : 1.0000
## Mean : 0.7216 Mean : 0.344 Mean : 0.7196 Mean : 0.1139
## 3rd Qu.: 1.0000 3rd Qu.: 1.000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
## Max. : 1.0000 Max. : 1.000 Max. : 1.0000 Max. : 1.0000
# Checking for missing values: count the NA entries in each column of df_phi
colSums(is.na(df_phi))
## having_IP_Address URL_Length
## 0 0
## Shortining_Service having_At_Symbol
## 0 0
## double_slash_redirecting Prefix_Suffix
## 0 0
## having_Sub_Domain SSLfinal_State
## 0 0
## Domain_registeration_length Favicon
## 0 0
## port HTTPS_token
## 0 0
## Request_URL URL_of_Anchor
## 0 0
## Links_in_tags SFH
## 0 0
## Submitting_to_email Abnormal_URL
## 0 0
## Redirect on_mouseover
## 0 0
## RightClick popUpWidnow
## 0 0
## Iframe age_of_domain
## 0 0
## DNSRecord web_traffic
## 0 0
## Page_Rank Google_Index
## 0 0
## Links_pointing_to_page Statistical_report
## 0 0
## Result
## 0
# There are no missing values in the dataset
# Outlier check: draw one box plot per column of the dataset
boxplot(x = df_phi, main = "Outlier detection of all columns using box plot")
# Insight: every column only takes the values -1, 0 or 1, so there are no
# outliers and no outlier treatment is performed.
# Pairwise correlation matrix of all columns
correlationMatrix <- cor(x = df_phi)
# Display the full correlation matrix
print(correlationMatrix)
## having_IP_Address URL_Length Shortining_Service
## having_IP_Address 1.000000000 -0.0524107388 0.4034610930
## URL_Length -0.052410739 1.0000000000 -0.0978809097
## Shortining_Service 0.403461093 -0.0978809097 1.0000000000
## having_At_Symbol 0.158698951 -0.0751084756 0.1044465494
## double_slash_redirecting 0.397389087 -0.0812470792 0.8427956224
## Prefix_Suffix -0.005256975 0.0552467160 -0.0804705533
## having_Sub_Domain -0.080744639 0.0039968791 -0.0419161818
## SSLfinal_State 0.071414500 0.0487537328 -0.0614256521
## Domain_registeration_length -0.022739206 -0.2218924309 0.0609231944
## Favicon 0.087024829 -0.0424972200 0.0061006489
## port 0.060979206 0.0003229483 0.0022008313
## HTTPS_token 0.363534482 -0.0893825485 0.7578377012
## Request_URL 0.029772867 0.2463480961 -0.0372345268
## URL_of_Anchor 0.099846955 -0.0233955096 0.0005614793
## Links_in_tags 0.006212404 0.0528690722 -0.1333791164
## SFH -0.010962287 0.4141962193 -0.0227234768
## Submitting_to_email 0.077989153 -0.0144574973 0.0493282274
## Abnormal_URL 0.336549357 -0.1067608602 0.7392895721
## Redirect -0.321181419 0.0468322384 -0.5345296596
## on_mouseover 0.084059316 -0.0451030375 0.0623834682
## RightClick 0.042881431 -0.0136133645 0.0381184510
## popUpWidnow 0.096882295 -0.0493812351 0.0366157897
## Iframe 0.054694437 -0.0138382241 0.0165807574
## age_of_domain -0.010445721 0.1794264244 -0.0525958372
## DNSRecord -0.050733303 -0.0408233717 0.4360642903
## web_traffic 0.002922205 0.0089927816 -0.0470743710
## Page_Rank -0.091773751 0.1835179731 0.0145913709
## Google_Index 0.029152889 0.0029024154 0.1558437456
## Links_pointing_to_page -0.339065107 -0.0229874156 -0.1984097700
## Statistical_report -0.019102515 -0.0671530890 0.0854607583
## Result 0.094160095 0.0574296293 -0.0679658927
## having_At_Symbol double_slash_redirecting
## having_IP_Address 0.158698951 0.397389087
## URL_Length -0.075108476 -0.081247079
## Shortining_Service 0.104446549 0.842795622
## having_At_Symbol 1.000000000 0.086959950
## double_slash_redirecting 0.086959950 1.000000000
## Prefix_Suffix -0.011725539 -0.085590400
## having_Sub_Domain -0.058975763 -0.043078808
## SSLfinal_State 0.031220000 -0.036199897
## Domain_registeration_length 0.015521535 0.047464231
## Favicon 0.304899055 0.035100076
## port 0.364890515 0.025060376
## HTTPS_token 0.104561007 0.760799398
## Request_URL 0.027909185 -0.026367650
## URL_of_Anchor 0.057913889 -0.005035903
## Links_in_tags -0.070861354 -0.125582670
## SFH -0.008671648 -0.041672182
## Submitting_to_email 0.370122659 0.031897965
## Abnormal_URL 0.203944877 0.723723563
## Redirect -0.028159591 -0.591477929
## on_mouseover 0.279697003 0.086634704
## RightClick 0.219503021 0.025863086
## popUpWidnow 0.290892843 0.054462617
## Iframe 0.284409945 0.010458641
## age_of_domain -0.005499097 -0.050106635
## DNSRecord -0.047871518 0.431409480
## web_traffic 0.032918391 -0.062369383
## Page_Rank -0.064735176 -0.003132332
## Google_Index 0.037061311 0.178414903
## Links_pointing_to_page -0.006080372 -0.194164634
## Statistical_report -0.080356601 0.070389942
## Result 0.052947789 -0.038607612
## Prefix_Suffix having_Sub_Domain SSLfinal_State
## having_IP_Address -0.005256975 -0.080744639 0.071414500
## URL_Length 0.055246716 0.003996879 0.048753733
## Shortining_Service -0.080470553 -0.041916182 -0.061425652
## having_At_Symbol -0.011725539 -0.058975763 0.031220000
## double_slash_redirecting -0.085590400 -0.043078808 -0.036199897
## Prefix_Suffix 1.000000000 0.087891090 0.261390532
## having_Sub_Domain 0.087891090 1.000000000 0.267648755
## SSLfinal_State 0.261390532 0.267648755 1.000000000
## Domain_registeration_length -0.096798530 -0.082838574 -0.193622046
## Favicon -0.007504228 -0.016704372 -0.014756696
## port -0.022545832 0.004862571 0.027472848
## HTTPS_token -0.070153106 -0.037239209 -0.029941373
## Request_URL 0.098674835 0.104856615 0.193054440
## URL_of_Anchor 0.348871199 0.229490709 0.535786191
## Links_in_tags 0.100253994 0.093645954 0.176825163
## SFH 0.001325541 0.096088828 0.171402389
## Submitting_to_email -0.045000319 0.008829840 0.008061500
## Abnormal_URL -0.077620303 -0.034907592 -0.046245129
## Redirect 0.016271426 0.031205629 -0.021069747
## on_mouseover 0.012578297 -0.018082180 0.023585818
## RightClick -0.024868266 0.018229873 0.015854372
## popUpWidnow -0.014732588 -0.025312228 -0.013004887
## Iframe -0.036904442 0.010636524 -0.002773194
## age_of_domain 0.074116234 0.119253766 0.162809420
## DNSRecord -0.016555559 0.125493384 0.050971780
## web_traffic 0.110597625 -0.005763910 0.258767835
## Page_Rank -0.006833928 0.120730067 0.074545009
## Google_Index 0.067780621 0.057672540 0.096051392
## Links_pointing_to_page 0.067423361 -0.010525803 -0.011710227
## Statistical_report -0.002762565 0.081627379 0.063410931
## Result 0.348605570 0.298323324 0.714741195
## Domain_registeration_length Favicon
## having_IP_Address -0.022739206 0.0870248294
## URL_Length -0.221892431 -0.0424972200
## Shortining_Service 0.060923194 0.0061006489
## having_At_Symbol 0.015521535 0.3048990547
## double_slash_redirecting 0.047464231 0.0351000758
## Prefix_Suffix -0.096798530 -0.0075042284
## having_Sub_Domain -0.082838574 -0.0167043723
## SSLfinal_State -0.193622046 -0.0147566962
## Domain_registeration_length 1.000000000 0.0542534505
## Favicon 0.054253451 1.0000000000
## port 0.022477551 0.8038335605
## HTTPS_token 0.059161071 0.0494830293
## Request_URL -0.609969688 -0.0046204010
## URL_of_Anchor -0.160257307 0.0376978302
## Links_in_tags -0.101084122 -0.1003410487
## SFH -0.136421980 -0.0122792023
## Submitting_to_email 0.039260343 0.6683166076
## Abnormal_URL 0.058108720 0.0718477294
## Redirect -0.016299937 -0.0156209326
## on_mouseover 0.023783847 0.7061793318
## RightClick 0.023520437 0.4143822065
## popUpWidnow 0.051409859 0.9396329250
## Iframe 0.004393378 0.6276072100
## age_of_domain -0.062851285 -0.0026278428
## DNSRecord -0.010476736 0.0882106587
## web_traffic -0.134454334 -0.0509218451
## Page_Rank -0.059898164 0.0116992227
## Google_Index -0.039765764 -0.0166677106
## Links_pointing_to_page 0.122671569 -0.1272430958
## Statistical_report -0.002212040 0.3009172188
## Result -0.225789462 -0.0002795247
## port HTTPS_token Request_URL
## having_IP_Address 0.0609792061 0.363534482 0.029772867
## URL_Length 0.0003229483 -0.089382549 0.246348096
## Shortining_Service 0.0022008313 0.757837701 -0.037234527
## having_At_Symbol 0.3648905154 0.104561007 0.027909185
## double_slash_redirecting 0.0250603764 0.760799398 -0.026367650
## Prefix_Suffix -0.0225458322 -0.070153106 0.098674835
## having_Sub_Domain 0.0048625706 -0.037239209 0.104856615
## SSLfinal_State 0.0274728481 -0.029941373 0.193054440
## Domain_registeration_length 0.0224775512 0.059161071 -0.609969688
## Favicon 0.8038335605 0.049483029 -0.004620401
## port 1.0000000000 0.004998623 0.027561329
## HTTPS_token 0.0049986228 1.000000000 -0.006619689
## Request_URL 0.0275613290 -0.006619689 1.000000000
## URL_of_Anchor 0.0398913780 0.011850847 0.177693201
## Links_in_tags -0.0665020356 -0.104381017 0.067491008
## SFH 0.0066721989 -0.009679787 0.126660605
## Submitting_to_email 0.7990881593 0.075477795 0.018177694
## Abnormal_URL 0.0541264070 0.716287367 -0.036033554
## Redirect -0.0224719854 -0.460164543 0.002329259
## on_mouseover 0.6232982087 0.110113271 0.008144032
## RightClick 0.4816308909 0.009264613 -0.020451804
## popUpWidnow 0.7485171731 0.066956881 -0.004622095
## Iframe 0.6870441024 0.017508903 0.016933672
## age_of_domain 0.0084587860 -0.049632114 0.090455473
## DNSRecord 0.0548490936 0.395386634 0.015932566
## web_traffic -0.0285426924 -0.039708370 0.161166301
## Page_Rank 0.0179537185 0.021104115 0.055734008
## Google_Index -0.0054130396 0.115449951 0.046408594
## Links_pointing_to_page -0.1391039546 -0.128724152 -0.067109020
## Statistical_report 0.3439868065 0.096186734 0.035411775
## Result 0.0364188509 -0.039853895 0.253372272
## URL_of_Anchor Links_in_tags SFH
## having_IP_Address 0.0998469553 0.006212404 -0.010962287
## URL_Length -0.0233955096 0.052869072 0.414196219
## Shortining_Service 0.0005614793 -0.133379116 -0.022723477
## having_At_Symbol 0.0579138894 -0.070861354 -0.008671648
## double_slash_redirecting -0.0050359032 -0.125582670 -0.041672182
## Prefix_Suffix 0.3488711994 0.100253994 0.001325541
## having_Sub_Domain 0.2294907094 0.093645954 0.096088828
## SSLfinal_State 0.5357861908 0.176825163 0.171402389
## Domain_registeration_length -0.1602573071 -0.101084122 -0.136421980
## Favicon 0.0376978302 -0.100341049 -0.012279202
## port 0.0398913780 -0.066502036 0.006672199
## HTTPS_token 0.0118508466 -0.104381017 -0.009679787
## Request_URL 0.1776932006 0.067491008 0.126660605
## URL_of_Anchor 1.0000000000 0.136283304 0.114311328
## Links_in_tags 0.1362833037 1.000000000 0.066597600
## SFH 0.1143113279 0.066597600 1.000000000
## Submitting_to_email 0.0333858654 -0.043231306 0.011472617
## Abnormal_URL -0.0105852295 -0.116065466 -0.030751841
## Redirect -0.0008394492 0.041497199 0.049907173
## on_mouseover 0.0677416541 -0.077669580 0.007579218
## RightClick 0.0221681580 -0.037469195 0.008467338
## popUpWidnow 0.0411501923 -0.112281660 -0.004862726
## Iframe 0.0134027350 -0.070029712 0.007066971
## age_of_domain 0.0755081304 0.078056784 -0.015839916
## DNSRecord 0.0932883535 -0.038544709 0.034439508
## web_traffic 0.3262932300 0.064548051 0.052706363
## Page_Rank 0.0992608715 -0.006449921 0.001978560
## Google_Index 0.0388162297 0.045557482 0.027587678
## Links_pointing_to_page 0.0186506323 0.013561478 -0.009067706
## Statistical_report 0.0773766300 -0.087343254 -0.005288871
## Result 0.6929345206 0.248228515 0.221419008
## Submitting_to_email Abnormal_URL Redirect
## having_IP_Address 0.07798915 0.336549357 -0.3211814194
## URL_Length -0.01445750 -0.106760860 0.0468322384
## Shortining_Service 0.04932823 0.739289572 -0.5345296596
## having_At_Symbol 0.37012266 0.203944877 -0.0281595909
## double_slash_redirecting 0.03189797 0.723723563 -0.5914779293
## Prefix_Suffix -0.04500032 -0.077620303 0.0162714260
## having_Sub_Domain 0.00882984 -0.034907592 0.0312056293
## SSLfinal_State 0.00806150 -0.046245129 -0.0210697467
## Domain_registeration_length 0.03926034 0.058108720 -0.0162999368
## Favicon 0.66831661 0.071847729 -0.0156209326
## port 0.79908816 0.054126407 -0.0224719854
## HTTPS_token 0.07547780 0.716287367 -0.4601645427
## Request_URL 0.01817769 -0.036033554 0.0023292586
## URL_of_Anchor 0.03338587 -0.010585229 -0.0008394492
## Links_in_tags -0.04323131 -0.116065466 0.0414971992
## SFH 0.01147262 -0.030751841 0.0499071729
## Submitting_to_email 1.00000000 0.195850077 -0.0073206100
## Abnormal_URL 0.19585008 1.000000000 -0.4591870655
## Redirect -0.00732061 -0.459187066 1.0000000000
## on_mouseover 0.53165607 0.117637628 -0.0348225195
## RightClick 0.39862940 0.023710312 -0.0235864240
## popUpWidnow 0.62946227 0.091188160 -0.0263268337
## Iframe 0.57749027 0.017590440 -0.0126676610
## age_of_domain 0.00735732 -0.032532658 -0.0224755413
## DNSRecord 0.06414537 0.366833384 -0.2110959875
## web_traffic -0.01568455 -0.052415797 0.0046314208
## Page_Rank 0.02620836 0.007317677 0.0528666040
## Google_Index -0.00837762 0.124751201 0.0572302894
## Links_pointing_to_page -0.03995573 -0.161026568 0.1612777884
## Statistical_report 0.35207398 0.186399046 -0.0591942408
## Result 0.01824901 -0.060487642 -0.0201134617
## on_mouseover RightClick popUpWidnow
## having_IP_Address 0.084059316 0.042881431 9.688229e-02
## URL_Length -0.045103038 -0.013613365 -4.938124e-02
## Shortining_Service 0.062383468 0.038118451 3.661579e-02
## having_At_Symbol 0.279697003 0.219503021 2.908928e-01
## double_slash_redirecting 0.086634704 0.025863086 5.446262e-02
## Prefix_Suffix 0.012578297 -0.024868266 -1.473259e-02
## having_Sub_Domain -0.018082180 0.018229873 -2.531223e-02
## SSLfinal_State 0.023585818 0.015854372 -1.300489e-02
## Domain_registeration_length 0.023783847 0.023520437 5.140986e-02
## Favicon 0.706179332 0.414382207 9.396329e-01
## port 0.623298209 0.481630891 7.485172e-01
## HTTPS_token 0.110113271 0.009264613 6.695688e-02
## Request_URL 0.008144032 -0.020451804 -4.622095e-03
## URL_of_Anchor 0.067741654 0.022168158 4.115019e-02
## Links_in_tags -0.077669580 -0.037469195 -1.122817e-01
## SFH 0.007579218 0.008467338 -4.862726e-03
## Submitting_to_email 0.531656071 0.398629404 6.294623e-01
## Abnormal_URL 0.117637628 0.023710312 9.118816e-02
## Redirect -0.034822519 -0.023586424 -2.632683e-02
## on_mouseover 1.000000000 0.474054044 7.336288e-01
## RightClick 0.474054044 1.000000000 4.152681e-01
## popUpWidnow 0.733628798 0.415268089 1.000000e+00
## Iframe 0.659478008 0.655862534 6.294063e-01
## age_of_domain 0.013305706 0.006763936 -9.482202e-04
## DNSRecord 0.087161413 0.038254940 9.865790e-02
## web_traffic -0.036531254 -0.013594318 -4.319003e-02
## Page_Rank 0.015633688 0.025341161 1.711415e-02
## Google_Index -0.006510003 -0.008065574 -1.025554e-02
## Links_pointing_to_page -0.038551028 -0.119831033 -1.213255e-01
## Statistical_report 0.277346725 0.204409183 2.852606e-01
## Result 0.041838440 0.012653235 8.588679e-05
## Iframe age_of_domain DNSRecord web_traffic
## having_IP_Address 0.054694437 -0.0104457207 -0.05073330 0.002922205
## URL_Length -0.013838224 0.1794264244 -0.04082337 0.008992782
## Shortining_Service 0.016580757 -0.0525958372 0.43606429 -0.047074371
## having_At_Symbol 0.284409945 -0.0054990967 -0.04787152 0.032918391
## double_slash_redirecting 0.010458641 -0.0501066349 0.43140948 -0.062369383
## Prefix_Suffix -0.036904442 0.0741162339 -0.01655556 0.110597625
## having_Sub_Domain 0.010636524 0.1192537659 0.12549338 -0.005763910
## SSLfinal_State -0.002773194 0.1628094196 0.05097178 0.258767835
## Domain_registeration_length 0.004393378 -0.0628512848 -0.01047674 -0.134454334
## Favicon 0.627607210 -0.0026278428 0.08821066 -0.050921845
## port 0.687044102 0.0084587860 0.05484909 -0.028542692
## HTTPS_token 0.017508903 -0.0496321141 0.39538663 -0.039708370
## Request_URL 0.016933672 0.0904554734 0.01593257 0.161166301
## URL_of_Anchor 0.013402735 0.0755081304 0.09328835 0.326293230
## Links_in_tags -0.070029712 0.0780567844 -0.03854471 0.064548051
## SFH 0.007066971 -0.0158399160 0.03443951 0.052706363
## Submitting_to_email 0.577490269 0.0073573196 0.06414537 -0.015684545
## Abnormal_URL 0.017590440 -0.0325326580 0.36683338 -0.052415797
## Redirect -0.012667661 -0.0224755413 -0.21109599 0.004631421
## on_mouseover 0.659478008 0.0133057058 0.08716141 -0.036531254
## RightClick 0.655862534 0.0067639361 0.03825494 -0.013594318
## popUpWidnow 0.629406251 -0.0009482202 0.09865790 -0.043190028
## Iframe 1.000000000 0.0188476779 0.04729313 -0.022079555
## age_of_domain 0.018847678 1.0000000000 -0.03408235 0.089948950
## DNSRecord 0.047293133 -0.0340823497 1.00000000 0.048649503
## web_traffic -0.022079555 0.0899489497 0.04864950 1.000000000
## Page_Rank 0.022407265 -0.1471935455 0.13786020 0.030984366
## Google_Index -0.003519028 -0.0284714597 0.13750903 -0.012583857
## Links_pointing_to_page -0.140823687 0.0404074371 -0.31826599 -0.019859996
## Statistical_report 0.268417804 0.0091150444 0.13685990 0.009222875
## Result -0.003393524 0.1214964165 0.07571775 0.346103108
## Page_Rank Google_Index Links_pointing_to_page
## having_IP_Address -0.091773751 0.029152889 -0.339065107
## URL_Length 0.183517973 0.002902415 -0.022987416
## Shortining_Service 0.014591371 0.155843746 -0.198409770
## having_At_Symbol -0.064735176 0.037061311 -0.006080372
## double_slash_redirecting -0.003132332 0.178414903 -0.194164634
## Prefix_Suffix -0.006833928 0.067780621 0.067423361
## having_Sub_Domain 0.120730067 0.057672540 -0.010525803
## SSLfinal_State 0.074545009 0.096051392 -0.011710227
## Domain_registeration_length -0.059898164 -0.039765764 0.122671569
## Favicon 0.011699223 -0.016667711 -0.127243096
## port 0.017953719 -0.005413040 -0.139103955
## HTTPS_token 0.021104115 0.115449951 -0.128724152
## Request_URL 0.055734008 0.046408594 -0.067109020
## URL_of_Anchor 0.099260872 0.038816230 0.018650632
## Links_in_tags -0.006449921 0.045557482 0.013561478
## SFH 0.001978560 0.027587678 -0.009067706
## Submitting_to_email 0.026208360 -0.008377620 -0.039955727
## Abnormal_URL 0.007317677 0.124751201 -0.161026568
## Redirect 0.052866604 0.057230289 0.161277788
## on_mouseover 0.015633688 -0.006510003 -0.038551028
## RightClick 0.025341161 -0.008065574 -0.119831033
## popUpWidnow 0.017114154 -0.010255539 -0.121325462
## Iframe 0.022407265 -0.003519028 -0.140823687
## age_of_domain -0.147193545 -0.028471460 0.040407437
## DNSRecord 0.137860200 0.137509033 -0.318265990
## web_traffic 0.030984366 -0.012583857 -0.019859996
## Page_Rank 1.000000000 0.032431230 -0.028215509
## Google_Index 0.032431230 1.000000000 -0.038776791
## Links_pointing_to_page -0.028215509 -0.038776791 1.000000000
## Statistical_report 0.031048613 -0.005102843 -0.016817478
## Result 0.104644905 0.128950452 0.032573899
## Statistical_report Result
## having_IP_Address -0.019102515 9.416009e-02
## URL_Length -0.067153089 5.742963e-02
## Shortining_Service 0.085460758 -6.796589e-02
## having_At_Symbol -0.080356601 5.294779e-02
## double_slash_redirecting 0.070389942 -3.860761e-02
## Prefix_Suffix -0.002762565 3.486056e-01
## having_Sub_Domain 0.081627379 2.983233e-01
## SSLfinal_State 0.063410931 7.147412e-01
## Domain_registeration_length -0.002212040 -2.257895e-01
## Favicon 0.300917219 -2.795247e-04
## port 0.343986806 3.641885e-02
## HTTPS_token 0.096186734 -3.985390e-02
## Request_URL 0.035411775 2.533723e-01
## URL_of_Anchor 0.077376630 6.929345e-01
## Links_in_tags -0.087343254 2.482285e-01
## SFH -0.005288871 2.214190e-01
## Submitting_to_email 0.352073976 1.824901e-02
## Abnormal_URL 0.186399046 -6.048764e-02
## Redirect -0.059194241 -2.011346e-02
## on_mouseover 0.277346725 4.183844e-02
## RightClick 0.204409183 1.265323e-02
## popUpWidnow 0.285260615 8.588679e-05
## Iframe 0.268417804 -3.393524e-03
## age_of_domain 0.009115044 1.214964e-01
## DNSRecord 0.136859898 7.571775e-02
## web_traffic 0.009222875 3.461031e-01
## Page_Rank 0.031048613 1.046449e-01
## Google_Index -0.005102843 1.289505e-01
## Links_pointing_to_page -0.016817478 3.257390e-02
## Statistical_report 1.000000000 7.985672e-02
## Result 0.079856718 1.000000e+00
# Plot the upper triangle of the correlation matrix, ordered by hierarchical clustering
corrplot(
  correlationMatrix,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu"),
  tl.cex = 0.5
)
# Find attributes that are highly correlated with each other (cutoff passed below: 0.8)
highlyCorrelated <- findCorrelation(
  correlationMatrix,
  cutoff = 0.8,
  verbose = TRUE
)
## Compare row 22 and column 10 with corr 0.94
## Means: 0.188 vs 0.118 so flagging column 22
## Compare row 10 and column 11 with corr 0.804
## Means: 0.161 vs 0.114 so flagging column 10
## Compare row 5 and column 3 with corr 0.843
## Means: 0.182 vs 0.109 so flagging column 5
## All correlations <= 0.8
# Print the column indexes flagged as highly correlated
# (by the names assigned to df_phi: 22 = popUpWidnow, 10 = Favicon, 5 = double_slash_redirecting)
print(highlyCorrelated)
## [1] 22 10 5
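# From the Result column of the correlation matrix, SSLfinal_State (0.71) and URL_of_Anchor (0.69) are strongly correlated with the Result variable, while web_traffic (0.35), having_Sub_Domain (0.30), Request_URL (0.25) and Domain_registeration_length (-0.23) are moderately correlated; Prefix_Suffix (0.35) is comparable. (The indexes printed above are predictors that are highly correlated with each other, not with Result.)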
# Plotting the relationships between important features and the target variable.
# Result is wrapped in factor() for the fill aesthetic: it is still an integer
# column at this point, and a continuous fill does not define groups, so
# without the conversion the bars are not split (dodged) by class.
# Plot Result vs SSLfinal_State
ggplot(df_phi, aes(x = SSLfinal_State, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")
# Plot Result vs URL_of_Anchor
ggplot(df_phi, aes(x = URL_of_Anchor, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")
# Plot Result vs web_traffic
ggplot(df_phi, aes(x = web_traffic, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")
# Plot Result vs having_Sub_Domain
ggplot(df_phi, aes(x = having_Sub_Domain, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")
# Plot Result vs Domain_registeration_length
ggplot(df_phi, aes(x = Domain_registeration_length, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")
# Plot Result vs Request_URL
ggplot(df_phi, aes(x = Request_URL, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")
# Treat the target as a categorical (factor) variable
df_phi[["Result"]] <- as.factor(df_phi[["Result"]])
# Random train/test split: each row is independently drawn as TRUE (train)
# with probability 0.8 or FALSE (test) with probability 0.2.
set.seed(1234)
# NOTE: this logical mask shares its name with base::sample(); the call on the
# right-hand side still resolves to the function.
sample <- sample(
  x = c(TRUE, FALSE),
  size = nrow(df_phi),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
phi_train <- df_phi[sample, ]
phi_test <- df_phi[!sample, ]
# Preview both partitions
head(phi_train)
head(phi_test)
# Fit a classification tree on the training partition, using the
# "information" splitting index
DT_phi <- rpart(
  formula = Result ~ .,
  data = phi_train,
  method = "class",
  parms = list(split = "information")
)
# Detailed summary of the fitted tree
summary(DT_phi)
## Call:
## rpart(formula = Result ~ ., data = phi_train, method = "class",
## parms = list(split = "information"))
## n= 8856
##
## CP nsplit rel error xerror xstd
## 1 0.74540347 0 1.0000000 1.0000000 0.011935025
## 2 0.03728294 1 0.2545965 0.2545965 0.007595734
## 3 0.01000000 2 0.2173136 0.2173136 0.007082457
##
## Variable importance
## SSLfinal_State URL_of_Anchor
## 44 31
## web_traffic having_Sub_Domain
## 9 7
## Domain_registeration_length Request_URL
## 5 4
##
## Node number 1: 8856 observations, complexity param=0.7454035
## predicted class=1 expected loss=0.4421861 P(node) =1
## class counts: 3916 4940
## probabilities: 0.442 0.558
## left son=2 (3795 obs) right son=3 (5061 obs)
## Primary splits:
## SSLfinal_State < 0.5 to the left, improve=2963.2510, (0 missing)
## URL_of_Anchor < -0.5 to the left, improve=2688.9690, (0 missing)
## Prefix_Suffix < 0 to the left, improve= 752.3226, (0 missing)
## web_traffic < 0.5 to the left, improve= 704.0302, (0 missing)
## having_Sub_Domain < 0.5 to the left, improve= 670.4705, (0 missing)
## Surrogate splits:
## URL_of_Anchor < -0.5 to the left, agree=0.821, adj=0.582, (0 split)
## web_traffic < 0.5 to the left, agree=0.657, adj=0.200, (0 split)
## having_Sub_Domain < 0.5 to the left, agree=0.643, adj=0.166, (0 split)
## Domain_registeration_length < 0 to the right, agree=0.619, adj=0.112, (0 split)
## Request_URL < 0 to the left, agree=0.614, adj=0.100, (0 split)
##
## Node number 2: 3795 observations
## predicted class=-1 expected loss=0.115415 P(node) =0.428523
## class counts: 3357 438
## probabilities: 0.885 0.115
##
## Node number 3: 5061 observations, complexity param=0.03728294
## predicted class=1 expected loss=0.1104525 P(node) =0.571477
## class counts: 559 4502
## probabilities: 0.110 0.890
## left son=6 (208 obs) right son=7 (4853 obs)
## Primary splits:
## URL_of_Anchor < -0.5 to the left, improve=333.3481, (0 missing)
## web_traffic < 0.5 to the left, improve=165.9506, (0 missing)
## Prefix_Suffix < 0 to the left, improve=143.7806, (0 missing)
## Links_in_tags < -0.5 to the left, improve=117.9490, (0 missing)
## having_Sub_Domain < 0.5 to the left, improve=114.6232, (0 missing)
##
## Node number 6: 208 observations
## predicted class=-1 expected loss=0.1490385 P(node) =0.0234869
## class counts: 177 31
## probabilities: 0.851 0.149
##
## Node number 7: 4853 observations
## predicted class=1 expected loss=0.0787142 P(node) =0.5479901
## class counts: 382 4471
## probabilities: 0.079 0.921
# Plotting decision tree using rpart.plot()
rpart.plot(DT_phi, main = "Decision Tree for Website phishing")
# Feature evaluation of the decision tree: collect the variable importance
# scores stored on the fitted tree into a data frame
phi_feature <- data.frame(imp = DT_phi$variable.importance)
# Move the variable names out of the row names, sort by importance and freeze
# that order as factor levels so the bars are drawn in sorted order
phi_feature1 <- phi_feature %>%
  tibble::rownames_to_column(var = "variable") %>%
  dplyr::arrange(imp) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable))
# Horizontal bar chart of importance per variable.
# show.legend uses FALSE rather than the reassignable shortcut F; the former
# scale_fill_grey() layer was dropped because no fill aesthetic is mapped.
ggplot2::ggplot(phi_feature1) +
  geom_col(
    aes(x = variable, y = imp),
    colour = "black",
    show.legend = FALSE
  ) +
  coord_flip() +
  theme_bw()
# The important features predicted from decision tree are SSLfinal_State, URL_of_Anchor, web_traffic, having_Sub_Domain, Domain_registeration_length,Request_URL
######################
# Predict classes for the training partition and cross-tabulate them:
# actual classes in rows, predicted classes in columns
phi_predict_train <- predict(DT_phi, newdata = phi_train, type = "class")
phi_table_train <- table(phi_train$Result, phi_predict_train)
print(phi_table_train)
## phi_predict_train
## -1 1
## -1 3534 382
## 1 469 4471
# Predict classes for the test partition and cross-tabulate them the same way
phi_predict_test <- predict(DT_phi, newdata = phi_test, type = "class")
phi_table_test <- table(phi_test$Result, phi_predict_test)
print(phi_table_test)
## phi_predict_test
## -1 1
## -1 891 91
## 1 94 1123
##################
# Confusion matrix to calculate the performance of the decision tree
# Confusion matrix for train data.
# The predicted and actual factors are passed directly. confusionMatrix()
# expects a table with predictions in rows and the reference in columns, but
# phi_table_train has the actual classes in rows; passing it swapped
# sensitivity with positive predictive value, specificity with negative
# predictive value, and prevalence with detection prevalence.
confusionMatrix(data = phi_predict_train, reference = phi_train$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 3534 469
## 1 382 4471
##
## Accuracy : 0.9039
## 95% CI : (0.8976, 0.91)
## No Information Rate : 0.5578
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8057
##
## Mcnemar's Test P-Value : 0.003198
##
## Sensitivity : 0.9025
## Specificity : 0.9051
## Pos Pred Value : 0.8828
## Neg Pred Value : 0.9213
## Prevalence : 0.4422
## Detection Rate : 0.3991
## Detection Prevalence : 0.4520
## Balanced Accuracy : 0.9038
##
## 'Positive' Class : -1
##
# Findings :
# 1. Accuracy of the decision tree on training data is 90.4%
# 2. Sensitivity of the decision tree on training data is 0.90
# 3. Specificity of the decision tree on training data is 0.91
# Confusion matrix for test data.
# The predicted and actual factors are passed directly. confusionMatrix()
# expects a table with predictions in rows and the reference in columns, but
# phi_table_test has the actual classes in rows; passing it swapped
# sensitivity with positive predictive value, specificity with negative
# predictive value, and prevalence with detection prevalence.
confusionMatrix(data = phi_predict_test, reference = phi_test$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 891 94
## 1 91 1123
##
## Accuracy : 0.9159
## 95% CI : (0.9035, 0.9271)
## No Information Rate : 0.5534
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8298
##
## Mcnemar's Test P-Value : 0.8831
##
## Sensitivity : 0.9073
## Specificity : 0.9228
## Pos Pred Value : 0.9046
## Neg Pred Value : 0.9250
## Prevalence : 0.4466
## Detection Rate : 0.4052
## Detection Prevalence : 0.4479
## Balanced Accuracy : 0.9150
##
## 'Positive' Class : -1
##
# Findings :
# 1. Accuracy of the decision tree on test data is 91.59%
# 2. Sensitivity of the decision tree on test data is 0.91
# 3. Specificity of the decision tree on test data is 0.92
###################
# Hold-out evaluation of the decision tree on the test data (a single
# train/test split, not k-fold cross validation).
# Both factors are converted to their integer level codes before the
# regression-style metrics are computed.
dt_pred_codes <- as.numeric(phi_predict_test)
dt_obs_codes <- as.numeric(phi_test$Result)
Validation_DT_model <- data.frame(
  R2 = R2(dt_pred_codes, dt_obs_codes),
  RMSE = RMSE(dt_pred_codes, dt_obs_codes),
  MAE = MAE(dt_pred_codes, dt_obs_codes)
)
Validation_DT_model
# Findings (values are printed in the column order R2, RMSE, MAE):
# 1. R2 is 0.688
# 2. Root Mean Squared Error (RMSE) is 0.29
# 3. Mean Absolute Error (MAE) is 0.084
# The level codes differ by 0 or 1, so MAE equals the misclassification rate
# (185 of 2199 test rows) and RMSE is its square root.
# Fix the RNG seed before fitting the forest
set.seed(51)
# Train a random forest classifier on the training partition
# (1000 trees, nodesize of 10)
rf_model <- randomForest(
  formula = Result ~ .,
  data = phi_train,
  ntree = 1000,
  nodesize = 10
)
print(rf_model)
##
## Call:
## randomForest(formula = Result ~ ., data = phi_train, ntree = 1000, nodesize = 10)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 3.9%
## Confusion matrix:
## -1 1 class.error
## -1 3702 214 0.05464760
## 1 131 4809 0.02651822
# Predicting the test set results
rf_pred <- predict(rf_model, newdata = phi_test)
# Cross-tabulation with actual classes in rows and predicted classes in columns
rf_table_test <- table(phi_test$Result, rf_pred)
# Confusion matrix.
# The predicted and actual factors are passed directly. confusionMatrix()
# expects a table with predictions in rows and the reference in columns, but
# rf_table_test has the actual classes in rows; passing it swapped
# sensitivity with positive predictive value, specificity with negative
# predictive value, and prevalence with detection prevalence.
confusionMatrix(data = rf_pred, reference = phi_test$Result)
## Confusion Matrix and Statistics
##
## Reference
## Prediction -1 1
## -1 939 32
## 1 43 1185
##
## Accuracy : 0.9659
## 95% CI : (0.9574, 0.9731)
## No Information Rate : 0.5534
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9309
##
## Mcnemar's Test P-Value : 0.2482
##
## Sensitivity : 0.9562
## Specificity : 0.9737
## Pos Pred Value : 0.9670
## Neg Pred Value : 0.9650
## Prevalence : 0.4466
## Detection Rate : 0.4270
## Detection Prevalence : 0.4416
## Balanced Accuracy : 0.9650
##
## 'Positive' Class : -1
##
# Findings :
# 1. Accuracy of the random forest model on test data is 96.59%
# 2. Sensitivity of the random forest model on test data is 0.956
# 3. Specificity of the random forest model on test data is 0.974
# Plot the fitted random forest model
plot(rf_model)
# Importance table: prints one MeanDecreaseGini score per predictor
importance(rf_model)
## MeanDecreaseGini
## having_IP_Address 34.949481
## URL_Length 25.041544
## Shortining_Service 16.476935
## having_At_Symbol 11.745501
## double_slash_redirecting 12.622672
## Prefix_Suffix 175.333063
## having_Sub_Domain 222.216222
## SSLfinal_State 1357.679219
## Domain_registeration_length 52.152567
## Favicon 13.330742
## port 8.198116
## HTTPS_token 17.598783
## Request_URL 63.905924
## URL_of_Anchor 1025.334523
## Links_in_tags 141.077161
## SFH 69.485551
## Submitting_to_email 14.958272
## Abnormal_URL 13.813237
## Redirect 15.960494
## on_mouseover 10.335438
## RightClick 3.527376
## popUpWidnow 15.436528
## Iframe 7.147825
## age_of_domain 34.651000
## DNSRecord 37.043953
## web_traffic 269.737583
## Page_Rank 29.464816
## Google_Index 38.460288
## Links_pointing_to_page 50.621932
## Statistical_report 12.634788
# Variable importance plot
varImpPlot(rf_model)
# Hold-out evaluation of the random forest on the test data (a single
# train/test split, not k-fold cross validation).
# Both factors are converted to their integer level codes before the
# regression-style metrics are computed.
rf_pred_codes <- as.numeric(rf_pred)
rf_obs_codes <- as.numeric(phi_test$Result)
Validation_rf_model <- data.frame(
  R2 = R2(rf_pred_codes, rf_obs_codes),
  RMSE = RMSE(rf_pred_codes, rf_obs_codes),
  MAE = MAE(rf_pred_codes, rf_obs_codes)
)
Validation_rf_model
# Findings (values are printed in the column order R2, RMSE, MAE):
# 1. R2 is 0.868 (high)
# 2. Root Mean Squared Error (RMSE) is 0.183 (low)
# 3. Mean Absolute Error (MAE) is 0.0336 (low)
# The level codes differ by 0 or 1, so MAE equals the misclassification rate
# and RMSE is its square root, which is why RMSE is the larger of the two.